diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..a0c5b15765249 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -16,6 +16,7 @@
 
 #include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIModeRegisterDefaults.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -1574,3 +1575,64 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
   }
   return BaseT::getNumberOfParts(Tp);
 }
+
+InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                                 StackOffset BaseOffset,
+                                                 bool HasBaseReg, int64_t Scale,
+                                                 unsigned AddrSpace) const {
+  // AMDGPU has limited addressing modes. base+scale*index requires an extra
+  // ADD instruction, unlike architectures with rich addressing modes.
+  if (HasBaseReg && Scale != 0)
+    return 1;
+  return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
+                                     AddrSpace);
+}
+
+bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
+                               const TTI::LSRCost &B) const {
+  const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(getST());
+
+  // Pre-GFX9: keep the default behavior.
+  if (ST.getGeneration() < AMDGPUSubtarget::GFX9)
+    return BaseT::isLSRCostLess(A, B);
+
+  // GFX9+: favor lower per-iteration work first; preheader/setup costs only
+  // act as tie-breakers.
+  //
+  // AMDGPU lacks rich addressing modes; base+scale*index requires a separate
+  // ADD, so ScaleCost is folded into the effective per-iteration instruction
+  // count.
+  const unsigned EffInsnsA = A.Insns + A.ScaleCost;
+  const unsigned EffInsnsB = B.Insns + B.ScaleCost;
+
+  // 1) Effective per-iteration instructions (includes addressing complexity).
+  if (EffInsnsA != EffInsnsB)
+    return EffInsnsA < EffInsnsB;
+
+  // 2) Strongly prefer fewer IV multiplications (mul/mul_hi/addc chains are
+  //    costly on AMDGPU).
+  if (A.NumIVMuls != B.NumIVMuls)
+    return A.NumIVMuls < B.NumIVMuls;
+
+  // 3) AddRecCost: per-iteration cost of IV updates (fewer IVs = lower cost).
+  if (A.AddRecCost != B.AddRecCost)
+    return A.AddRecCost < B.AddRecCost;
+
+  // 4) Prefer fewer per-iteration base adds as a tie-breaker.
+  if (A.NumBaseAdds != B.NumBaseAdds)
+    return A.NumBaseAdds < B.NumBaseAdds;
+
+  // 5) Preheader-related costs.
+  if (A.SetupCost != B.SetupCost)
+    return A.SetupCost < B.SetupCost;
+
+  // 6) Minor keys to stabilize ordering (ImmCost, NumRegs). ScaleCost is
+  //    already accounted for in EffInsns, so it is not compared separately.
+  if (A.ImmCost != B.ImmCost)
+    return A.ImmCost < B.ImmCost;
+
+  return A.NumRegs < B.NumRegs;
+}
+
+bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const { return false; }
+
+bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 20da8344c9d37..6bb19e2da5183 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -302,6 +302,27 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   /// together under a single i32 value. Otherwise fall back to base
   /// implementation.
   unsigned getNumberOfParts(Type *Tp) const override;
+
+  /// Returns 1 when an address uses both a base register and a non-zero
+  /// \p Scale, because the scaled index then needs a separate ADD on AMDGPU.
+  /// Otherwise defers to the base implementation.
+  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                       StackOffset BaseOffset,
+                                       bool HasBaseReg, int64_t Scale,
+                                       unsigned AddrSpace) const override;
+
+  /// Returns true if LSR cost \p A orders before \p B. On GFX9 and newer the
+  /// keys are compared in this order: Insns + ScaleCost, NumIVMuls,
+  /// AddRecCost, NumBaseAdds, SetupCost, ImmCost, NumRegs. Older generations
+  /// use the base implementation.
+  bool isLSRCostLess(const TTI::LSRCost &A,
+                     const TTI::LSRCost &B) const override;
+
+  /// Always returns false.
+  bool isNumRegsMajorCostOfLSR() const override;
+
+  /// Always returns true.
+  bool shouldDropLSRSolutionIfLessProfitable() const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index a21db73cf3714..ff5391fc1f107 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -510,21 +510,20 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc +; GFX908-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s12, 0 -; GFX908-NEXT: s_mov_b32 s9, s12 +; GFX908-NEXT: s_mov_b32 s8, 0 +; GFX908-NEXT: v_mov_b32_e32 v16, 0 ; 
GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX908-NEXT: s_sub_i32 s1, 0, s7 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 -; GFX908-NEXT: v_mov_b32_e32 v17, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_readfirstlane_b32 s2, v0 +; GFX908-NEXT: v_cvt_f32_f16_e32 v14, s0 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX908-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX908-NEXT: v_readfirstlane_b32 s2, v1 ; GFX908-NEXT: s_mul_i32 s1, s1, s2 ; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 ; GFX908-NEXT: s_add_i32 s2, s2, s1 @@ -533,164 +532,154 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_sub_i32 s2, s6, s2 ; GFX908-NEXT: s_add_i32 s3, s1, 1 ; GFX908-NEXT: s_sub_i32 s6, s2, s7 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readfirstlane_b32 s9, v0 +; GFX908-NEXT: s_and_b32 s18, s9, 0xffff ; GFX908-NEXT: s_cmp_ge_u32 s2, s7 ; GFX908-NEXT: s_cselect_b32 s1, s3, s1 ; GFX908-NEXT: s_cselect_b32 s2, s6, s2 ; GFX908-NEXT: s_add_i32 s3, s1, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s8, s3, s1 -; GFX908-NEXT: s_lshr_b32 s2, s0, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: s_cselect_b32 s19, s3, s1 +; GFX908-NEXT: s_lshr_b32 s0, s0, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v15, s0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s14, s14, 28 -; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s2, v16 -; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX908-NEXT: s_mul_i32 s3, s5, s2 -; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 -; 
GFX908-NEXT: s_mul_i32 s2, s4, s2 -; GFX908-NEXT: s_add_i32 s3, s5, s3 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX908-NEXT: s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow20 +; GFX908-NEXT: .LBB3_1: ; %Flow10 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX908-NEXT: s_cbranch_vccz .LBB3_12 +; GFX908-NEXT: s_cbranch_vccz .LBB3_13 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[18:19], -1 +; GFX908-NEXT: s_mov_b64 s[6:7], -1 ; GFX908-NEXT: s_mov_b64 vcc, s[0:1] -; GFX908-NEXT: s_cbranch_vccz .LBB3_10 +; GFX908-NEXT: s_cbranch_vccz .LBB3_11 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[8:9], v[0:1], off ; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX908-NEXT: s_mov_b32 s13, s12 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v4, s12 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s12 -; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] -; GFX908-NEXT: v_mov_b32_e32 v10, v4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[6:7], s[10:11], 0 +; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3] +; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s9, v2 -; GFX908-NEXT: 
v_readfirstlane_b32 s13, v3 -; GFX908-NEXT: s_add_u32 s9, s9, 1 -; GFX908-NEXT: s_addc_u32 s13, s13, 0 -; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX908-NEXT: s_mul_i32 s13, s6, s13 -; GFX908-NEXT: s_mul_i32 s23, s7, s9 -; GFX908-NEXT: s_add_i32 s13, s22, s13 -; GFX908-NEXT: s_mul_i32 s9, s6, s9 -; GFX908-NEXT: s_add_i32 s13, s13, s23 +; GFX908-NEXT: v_readfirstlane_b32 s12, v8 +; GFX908-NEXT: v_readfirstlane_b32 s13, v9 +; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: s_branch .LBB3_5 -; GFX908-NEXT: .LBB3_4: ; %bb58 +; GFX908-NEXT: .LBB3_4: ; %Flow8 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s4 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s5 -; GFX908-NEXT: s_mov_b64 s[22:23], 0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX908-NEXT: s_cbranch_vccz .LBB3_9 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_cbranch_vccz .LBB3_10 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s9 -; GFX908-NEXT: s_addc_u32 s23, s21, s13 -; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc +; GFX908-NEXT: s_add_u32 s9, s12, 1 +; GFX908-NEXT: s_addc_u32 s14, s13, 0 +; GFX908-NEXT: s_mul_i32 s15, s9, s5 +; GFX908-NEXT: s_mul_hi_u32 s16, s9, s4 +; GFX908-NEXT: s_add_i32 s15, s16, s15 +; GFX908-NEXT: s_mul_i32 s14, s14, s4 +; GFX908-NEXT: s_add_i32 s15, s15, s14 +; GFX908-NEXT: s_mul_i32 s9, s9, s4 +; GFX908-NEXT: s_add_u32 s14, s9, s10 +; GFX908-NEXT: s_addc_u32 s15, s15, s11 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[14:15], 5 +; GFX908-NEXT: global_load_dword v18, v16, s[14:15] offset:16 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: 
global_load_dword v20, v17, s[22:23] offset:-8 glc +; GFX908-NEXT: global_load_dword v17, v16, s[14:15] offset:20 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc +; GFX908-NEXT: global_load_dword v10, v16, s[14:15] offset:24 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc +; GFX908-NEXT: global_load_dword v10, v16, s[14:15] offset:28 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[12:13], v17 -; GFX908-NEXT: ds_read_b64 v[14:15], v0 +; GFX908-NEXT: ds_read_b64 v[10:11], v16 +; GFX908-NEXT: ds_read_b64 v[12:13], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX908-NEXT: ; kill: killed $sgpr14_sgpr15 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v18, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v19, v13 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 -; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 -; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 -; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 -; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 -; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 -; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 -; GFX908-NEXT: s_branch .LBB3_4 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: 
v_cvt_f32_f16_e32 v18, v18 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX908-NEXT: v_add_f32_e32 v21, v14, v10 +; GFX908-NEXT: v_add_f32_e32 v22, v15, v11 +; GFX908-NEXT: v_add_f32_e32 v23, 0, v10 +; GFX908-NEXT: v_add_f32_e32 v24, 0, v11 +; GFX908-NEXT: v_add_f32_e32 v13, v19, v13 +; GFX908-NEXT: v_add_f32_e32 v12, v18, v12 +; GFX908-NEXT: v_add_f32_e32 v11, v20, v11 +; GFX908-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v22 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v21 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v24 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v23 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v12 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v13 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v10 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v11 +; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_branch .LBB3_8 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] -; GFX908-NEXT: s_cbranch_vccz .LBB3_4 -; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[22:23], -1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard +; GFX908-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX908-NEXT: .LBB3_8: ; %Flow +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX908-NEXT: ; %bb.9: ; %bb58 +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: s_add_u32 s12, s12, s18 +; GFX908-NEXT: s_addc_u32 s13, s13, 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[12:13], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_branch .LBB3_4 +; GFX908-NEXT: .LBB3_10: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: 
Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow19 +; GFX908-NEXT: s_xor_b64 s[6:7], s[14:15], -1 +; GFX908-NEXT: .LBB3_11: ; %Flow9 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[2:3], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 -; GFX908-NEXT: ; %bb.11: ; %bb12 +; GFX908-NEXT: ; %bb.12: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s10, s10, s8 +; GFX908-NEXT: s_add_u32 s10, s10, s19 ; GFX908-NEXT: s_addc_u32 s11, s11, 0 -; GFX908-NEXT: s_add_u32 s14, s14, s16 -; GFX908-NEXT: s_addc_u32 s15, s15, s17 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 ; GFX908-NEXT: s_branch .LBB3_1 -; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock +; GFX908-NEXT: .LBB3_13: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc +; GFX90A-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 ; GFX90A-NEXT: s_mov_b32 s12, 0 -; GFX90A-NEXT: s_mov_b32 s9, s12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: s_mul_i32 s1, s1, s2 ; GFX90A-NEXT: 
s_mul_hi_u32 s1, s2, s1 @@ -700,132 +689,124 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_sub_i32 s2, s6, s2 ; GFX90A-NEXT: s_add_i32 s3, s1, 1 ; GFX90A-NEXT: s_sub_i32 s6, s2, s7 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: s_and_b32 s18, s8, 0xffff ; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 ; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 ; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 ; GFX90A-NEXT: s_add_i32 s3, s1, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 -; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX90A-NEXT: s_cselect_b32 s19, s3, s1 +; GFX90A-NEXT: s_lshr_b32 s1, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_or_b32 s14, s14, 28 -; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 -; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX90A-NEXT: s_mul_i32 s3, s5, s2 -; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX90A-NEXT: s_mul_i32 s2, s4, s2 -; GFX90A-NEXT: s_add_i32 s3, s5, s3 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: .LBB3_1: ; %Flow20 +; GFX90A-NEXT: .LBB3_1: ; %Flow10 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 +; GFX90A-NEXT: s_cbranch_vccz .LBB3_13 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[18:19], -1 +; GFX90A-NEXT: s_mov_b64 s[6:7], -1 ; GFX90A-NEXT: s_mov_b64 vcc, s[0:1] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 +; GFX90A-NEXT: s_cbranch_vccz .LBB3_11 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; 
in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[2:3], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 ; GFX90A-NEXT: s_mov_b32 s13, s12 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[6:7], s[10:11], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] -; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 -; GFX90A-NEXT: s_add_u32 s9, s9, 1 -; GFX90A-NEXT: s_addc_u32 s13, s13, 0 -; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX90A-NEXT: s_mul_i32 s13, s6, s13 -; GFX90A-NEXT: s_mul_i32 s23, s7, s9 -; GFX90A-NEXT: s_add_i32 s13, s22, s13 -; GFX90A-NEXT: s_mul_i32 s9, s6, s9 -; GFX90A-NEXT: s_add_i32 s13, s13, s23 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v10 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v11 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_branch .LBB3_5 -; GFX90A-NEXT: .LBB3_4: ; %bb58 +; GFX90A-NEXT: .LBB3_4: ; %Flow8 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s4 -; GFX90A-NEXT: s_addc_u32 s21, s21, s5 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[22:23], 
0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s9 -; GFX90A-NEXT: s_addc_u32 s23, s21, s13 -; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s13, s8, 1 +; GFX90A-NEXT: s_addc_u32 s14, s9, 0 +; GFX90A-NEXT: s_mul_i32 s15, s13, s5 +; GFX90A-NEXT: s_mul_hi_u32 s16, s13, s4 +; GFX90A-NEXT: s_add_i32 s15, s16, s15 +; GFX90A-NEXT: s_mul_i32 s14, s14, s4 +; GFX90A-NEXT: s_add_i32 s15, s15, s14 +; GFX90A-NEXT: s_mul_i32 s13, s13, s4 +; GFX90A-NEXT: s_add_u32 s14, s13, s10 +; GFX90A-NEXT: s_addc_u32 s15, s15, s11 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[14:15], 5 +; GFX90A-NEXT: global_load_dword v18, v16, s[14:15] offset:16 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX90A-NEXT: global_load_dword v17, v16, s[14:15] offset:20 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc +; GFX90A-NEXT: global_load_dword v12, v16, s[14:15] offset:24 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc +; GFX90A-NEXT: global_load_dword v12, v16, s[14:15] offset:28 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[14:15], v19 -; GFX90A-NEXT: ds_read_b64 v[16:17], v0 +; GFX90A-NEXT: ds_read_b64 v[12:13], v16 +; GFX90A-NEXT: ds_read_b64 v[14:15], v0 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 +; GFX90A-NEXT: ; kill: killed $sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[0:1], v[14:15] -; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] -; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v17 +; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[0:1], v[12:13] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[12:13], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[18:19], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[20:21], v[12:13] +; GFX90A-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[22:23] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] -; GFX90A-NEXT: s_branch .LBB3_4 +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[12:13] +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_branch .LBB3_8 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 -; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[22:23], -1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard +; GFX90A-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX90A-NEXT: .LBB3_8: ; %Flow +; GFX90A-NEXT: ; 
in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX90A-NEXT: ; %bb.9: ; %bb58 +; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: s_add_u32 s8, s8, s18 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[8:9], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_branch .LBB3_4 +; GFX90A-NEXT: .LBB3_10: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow19 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[14:15], -1 +; GFX90A-NEXT: .LBB3_11: ; %Flow9 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[2:3], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 -; GFX90A-NEXT: ; %bb.11: ; %bb12 +; GFX90A-NEXT: ; %bb.12: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s10, s10, s8 +; GFX90A-NEXT: s_add_u32 s10, s10, s19 ; GFX90A-NEXT: s_addc_u32 s11, s11, 0 -; GFX90A-NEXT: s_add_u32 s14, s14, s16 -; GFX90A-NEXT: s_addc_u32 s15, s15, s17 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_branch .LBB3_1 -; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock +; GFX90A-NEXT: .LBB3_13: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm bb: %i = load volatile i16, ptr addrspace(4) poison, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll index 931a14473c340..f5223d5553c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s1, 0 ; GFX7-NEXT: .LBB0_1: ; %loop ; 
GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_add_i32 s1, s1, 1 -; GFX7-NEXT: s_add_i32 s0, s0, 4 -; GFX7-NEXT: s_cmp_lt_u32 s1, 16 +; GFX7-NEXT: s_lshl_b32 s1, s0, 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_i32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_cmp_lt_u32 s0, 16 ; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %done @@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX8-NEXT: s_add_u32 s88, s88, s11 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_mov_b32 s0, 0 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB0_1: ; %loop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_add_i32 s1, s1, 1 -; GFX8-NEXT: s_add_i32 s0, s0, 4 -; GFX8-NEXT: s_cmp_lt_u32 s1, 16 +; GFX8-NEXT: s_lshl_b32 s1, s0, 2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_i32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_cmp_lt_u32 s0, 16 ; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %done diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 31344c78990b8..c27a12f4588ee 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2104,36 +2104,36 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: 
s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-GISEL: ; %bb.0: ; %bb -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] -; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1 ; 
GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX1250-GISEL-NEXT: s_endpgm @@ -2159,42 +2159,40 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 -; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-GISEL: ; %bb.0: ; %bb -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 
v4, vcc_lo, v0, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] -; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 ; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 9ebf6ae88a517..a405f7888423f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4722,17 +4722,16 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX9-LABEL: global_addr_64bit_lsr_iv: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_movk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB132_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_add_u32 s4, s2, s0 -; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; 
GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -4740,17 +4739,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX10-LABEL: global_addr_64bit_lsr_iv: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_movk_i32 s0, 0x100 ; GFX10-NEXT: .LBB132_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s4, s2, s0 -; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX10-NEXT: s_add_i32 s0, s0, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -4758,17 +4756,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX11-LABEL: global_addr_64bit_lsr_iv: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_movk_i32 s0, 0x100 ; GFX11-NEXT: .LBB132_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s4, s2, s0 -; GFX11-NEXT: s_addc_u32 s5, s3, s1 -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX11-NEXT: global_load_b32 v1, 
v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, -1 +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -4776,38 +4772,34 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc -; GFX12-GISEL-NEXT: v_add_co_u32 v2, 
vcc, v2, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-NEXT: s_endpgm @@ -4832,20 +4824,18 @@ bb3: ; preds = %bb3, %bb define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) { ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_movk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB133_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_add_u32 s4, s2, s0 -; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -4853,20 +4843,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr 
addrspace(1) inreg ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_movk_i32 s0, 0x100 ; GFX10-NEXT: .LBB133_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s4, s2, s0 -; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5 +; GFX10-NEXT: s_add_i32 s0, s0, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -4874,19 +4862,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_movk_i32 s0, 0x100 ; GFX11-NEXT: .LBB133_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s4, s2, s0 -; GFX11-NEXT: s_addc_u32 s5, s3, s1 -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; 
GFX11-NEXT: s_add_i32 s0, s0, -1 +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -4894,42 +4880,38 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 -; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc -; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 835818fb2fd15..4b524481b38c1 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -8,44 +8,39 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: 
s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s8, s5, s4 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s2, s2, s4 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, s3 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 -; GFX9-NEXT: s_mul_i32 s10, s6, s10 -; GFX9-NEXT: s_add_i32 s11, s5, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_add_i32 s10, s7, s10 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s11, s11, s5 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_add_i32 s10, s11, 1 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s11 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 -; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX9-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s7, s2, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s9, s7, s6 +; GFX9-NEXT: s_cmp_ge_u32 s7, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -55,45 +50,40 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) 
nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s2, 0, s6 +; GFX10-NEXT: s_sub_i32 s3, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 -; GFX10-NEXT: s_add_i32 s8, s4, s5 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s2 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s2, s4 +; GFX10-NEXT: s_mov_b32 s2, s3 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b32 s10, s5 -; GFX10-NEXT: s_mul_i32 s9, s6, s5 -; GFX10-NEXT: s_mul_i32 s10, s6, s10 -; GFX10-NEXT: s_sub_i32 s9, s7, s9 -; GFX10-NEXT: s_add_i32 s11, s5, 1 -; GFX10-NEXT: s_add_i32 s10, s7, s10 -; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s11, s11, s5 -; GFX10-NEXT: s_cselect_b32 s9, s10, s9 -; GFX10-NEXT: s_add_i32 s10, s11, 1 -; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s9, s10, s11 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 -; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX10-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10-NEXT: s_add_i32 s8, s5, 1 +; GFX10-NEXT: s_sub_i32 s7, s2, 
s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s6 +; GFX10-NEXT: s_cmp_ge_u32 s7, s6 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 +; GFX10-NEXT: s_cselect_b32 s7, s9, s7 +; GFX10-NEXT: s_add_i32 s8, s5, 1 +; GFX10-NEXT: s_cmp_ge_u32 s7, s6 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s8, s0, s8 +; GFX10-NEXT: s_addc_u32 s9, s1, s9 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -103,49 +93,46 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX11-NEXT: s_sub_i32 s2, 0, s6 +; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: s_add_i32 s8, s4, s5 -; GFX11-NEXT: s_mov_b64 s[4:5], 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s2 +; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s2, s4 +; GFX11-NEXT: s_mov_b32 s2, s3 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB0_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX11-NEXT: s_mul_i32 s7, s5, s6 +; GFX11-NEXT: s_add_i32 s8, s5, 1 +; GFX11-NEXT: s_sub_i32 s7, s2, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s9, s7, s6 +; GFX11-NEXT: s_cmp_ge_u32 s7, s6 +; GFX11-NEXT: s_cselect_b32 s5, s8, s5 +; GFX11-NEXT: s_cselect_b32 s7, s9, s7 +; GFX11-NEXT: s_add_i32 s8, s5, 1 +; GFX11-NEXT: s_cmp_ge_u32 s7, s6 +; GFX11-NEXT: s_cselect_b32 s5, s8, s5 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_add_u32 s8, s0, s8 +; GFX11-NEXT: s_addc_u32 s9, s1, s9 +; GFX11-NEXT: s_add_i32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s10, s5 -; GFX11-NEXT: s_mul_i32 s9, s6, s5 -; GFX11-NEXT: s_mul_i32 s10, s6, s10 -; GFX11-NEXT: s_sub_i32 s9, s7, s9 -; GFX11-NEXT: s_add_i32 s11, s5, 1 -; GFX11-NEXT: s_add_i32 s10, s7, s10 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s11, s11, s5 -; GFX11-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-NEXT: s_add_i32 s10, s11, 1 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s9, s10, s11 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 -; GFX11-NEXT: s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: global_store_b32 v0, v1, s[8:9] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -171,42 +158,37 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s8, s5, s4 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s2, s2, s4 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, s3 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 -; GFX9-NEXT: s_mul_i32 s10, s6, s10 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_add_i32 s10, s7, s10 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_sub_i32 s10, s9, s6 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 -; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX9-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s5, s2, s5 +; GFX9-NEXT: s_sub_i32 s7, s5, s6 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_sub_i32 s7, s5, s6 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -216,43 +198,38 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s2, 0, s6 +; GFX10-NEXT: s_sub_i32 s3, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 -; GFX10-NEXT: s_add_i32 s8, s4, s5 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s2 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s2, s4 +; GFX10-NEXT: s_mov_b32 s2, s3 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_not_b32 s9, s5 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX10-NEXT: s_mul_i32 s5, s5, s6 +; GFX10-NEXT: s_sub_i32 s5, s2, s5 +; GFX10-NEXT: s_sub_i32 s7, s5, s6 +; GFX10-NEXT: s_cmp_ge_u32 s5, s6 +; GFX10-NEXT: s_cselect_b32 s5, s7, s5 +; GFX10-NEXT: s_sub_i32 s7, s5, s6 +; GFX10-NEXT: s_cmp_ge_u32 s5, s6 +; GFX10-NEXT: s_cselect_b32 s5, s7, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mul_i32 s10, s6, s5 -; GFX10-NEXT: s_mul_i32 s9, s6, s9 -; GFX10-NEXT: s_sub_i32 s10, s7, s10 -; GFX10-NEXT: s_add_i32 s9, s7, s9 -; GFX10-NEXT: s_cmp_ge_u32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_sub_i32 s10, s9, s6 -; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: 
s_cselect_b32 s9, s10, s9 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 -; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX10-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s8, s0, s8 +; GFX10-NEXT: s_addc_u32 s9, s1, s9 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -262,48 +239,45 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX11-NEXT: s_sub_i32 s2, 0, s6 +; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: s_add_i32 s8, s4, s5 -; GFX11-NEXT: s_mov_b64 s[4:5], 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s2 +; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s2, s4 +; GFX11-NEXT: s_mov_b32 s2, s3 ; GFX11-NEXT: 
.p2align 6 ; GFX11-NEXT: .LBB1_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX11-NEXT: s_mul_i32 s5, s5, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s5, s2, s5 +; GFX11-NEXT: s_sub_i32 s7, s5, s6 +; GFX11-NEXT: s_cmp_ge_u32 s5, s6 +; GFX11-NEXT: s_cselect_b32 s5, s7, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s9, s5 -; GFX11-NEXT: s_mul_i32 s10, s6, s5 -; GFX11-NEXT: s_mul_i32 s9, s6, s9 -; GFX11-NEXT: s_sub_i32 s10, s7, s10 -; GFX11-NEXT: s_add_i32 s9, s7, s9 -; GFX11-NEXT: s_cmp_ge_u32 s10, s6 -; GFX11-NEXT: s_cselect_b32 s9, s9, s10 +; GFX11-NEXT: s_sub_i32 s7, s5, s6 +; GFX11-NEXT: s_cmp_ge_u32 s5, s6 +; GFX11-NEXT: s_cselect_b32 s5, s7, s5 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_add_u32 s8, s0, s8 +; GFX11-NEXT: s_addc_u32 s9, s1, s9 +; GFX11-NEXT: s_add_i32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s10, s9, s6 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 -; GFX11-NEXT: s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: global_store_b32 v0, v1, s[8:9] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index 
34a9624cb19eb..e8a7a11afda0d 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -19,11 +19,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: v_add_nc_u32_e32 v3, -4, v3 ; CHECK-NEXT: .LBB0_2: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; j lastloop entry ; CHECK-NEXT: ;;#ASMEND @@ -33,40 +32,41 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_3: ; %for.body33 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: s_mov_b32 s9, 4 ; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %if.end118 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v4, 4, v1 ; CHECK-NEXT: s_add_i32 s9, s9, 4 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; backedge ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v4, v0 ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execz .LBB0_1 ; CHECK-NEXT: .LBB0_6: 
; %for.body51 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, 1 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %if.then112 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_add_i32 s10, s9, 4 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, s10 -; CHECK-NEXT: ds_write_b32 v1, v3 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, s9 +; CHECK-NEXT: ds_write_b32 v3, v4 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader ; CHECK-NEXT: s_inst_prefetch 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 680942fcb4d4b..15c4e746b1e07 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -62,7 +62,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v45, 0 +; CHECK-NEXT: v_mov_b32_e32 v46, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v43, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -91,7 +91,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 +; CHECK-NEXT: ds_write_b32 v46, v46 offset:15360 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 @@ -118,69 +118,66 @@ define protected amdgpu_kernel 
void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 -; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14 +; CHECK-NEXT: v_mul_lo_u32 v44, v41, 14 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44 ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s5, v45 ; CHECK-NEXT: s_add_i32 s5, s5, 1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 -; CHECK-NEXT: ds_write_b8 v1, v45 +; CHECK-NEXT: ds_write_b8 v0, v46 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 +; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v42 ; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v46 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_mov_b32 s54, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 -; CHECK-NEXT: s_lshl_b32 s4, s55, 5 -; CHECK-NEXT: s_add_i32 s54, s55, 1 -; CHECK-NEXT: s_add_i32 s5, s55, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 +; CHECK-NEXT: s_mov_b32 s4, s54 +; CHECK-NEXT: s_lshl_b32 s5, s54, 5 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45 +; CHECK-NEXT: s_add_i32 s54, s54, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 5 +; CHECK-NEXT: v_or3_b32 v57, s5, v43, s54 +; CHECK-NEXT: 
v_mov_b32_e32 v58, s54 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s54 -; CHECK-NEXT: s_mov_b32 s68, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 +; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 s4, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s69, 0 -; CHECK-NEXT: s_mov_b32 s80, 0 +; CHECK-NEXT: s_mov_b32 s68, 0 +; CHECK-NEXT: s_mov_b32 s69, s54 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: s_add_i32 s80, s80, 4 -; CHECK-NEXT: s_add_i32 s4, s55, s80 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57 -; CHECK-NEXT: s_add_i32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s4, s4, 1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 -; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: s_add_i32 s4, s69, 4 +; CHECK-NEXT: v_add_nc_u32_e32 v57, 4, v57 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v42 +; CHECK-NEXT: v_mov_b32_e32 v58, s69 +; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68 ; CHECK-NEXT: s_cbranch_execz .LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57 -; CHECK-NEXT: ds_read_u8 v0, v59 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s69, v45 +; CHECK-NEXT: s_mov_b32 s69, s4 +; CHECK-NEXT: ds_read_u8 v0, v58 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s5, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s80, s5 ; 
CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -199,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v58 +; CHECK-NEXT: ds_write_b32 v0, v57 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -221,17 +218,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v60 +; CHECK-NEXT: ds_write_b32 v0, v59 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: 
Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -247,17 +244,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v59, 2, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v60 +; CHECK-NEXT: ds_write_b32 v0, v59 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -273,19 +270,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_16: ; %Flow43 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 -; CHECK-NEXT: v_mov_b32_e32 v57, v0 -; CHECK-NEXT: .LBB0_17: ; %Flow44 +; CHECK-NEXT: .LBB0_16: ; %Flow32 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 +; CHECK-NEXT: .LBB0_17: ; %Flow33 +; CHECK-NEXT: ; in Loop: Header=BB0_5 
Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 @@ -306,7 +302,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD ; CHECK-NEXT: s_and_saveexec_b32 s69, s4 @@ -330,24 +326,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 ; CHECK-NEXT: s_branch .LBB0_19 -; CHECK-NEXT: .LBB0_22: ; %Flow41 +; CHECK-NEXT: .LBB0_22: ; %Flow30 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 -; CHECK-NEXT: .LBB0_23: ; %Flow42 +; CHECK-NEXT: .LBB0_23: ; %Flow31 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v46 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s55, s54 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 ; CHECK-NEXT: s_or_b32 s53, s4, s53 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 -; CHECK-NEXT: .LBB0_25: ; %Flow49 +; CHECK-NEXT: .LBB0_25: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 @@ -828,7 +822,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: 
s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 +; CHECK-NEXT: v_mul_lo_u32 v44, v0, 14 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -842,7 +836,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 -; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42 @@ -867,51 +861,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 ; CHECK-NEXT: s_mov_b32 s52, 0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 -; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 +; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: ds_write_b8 v44, v43 offset:15364 +; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v41 ; CHECK-NEXT: .LBB1_1: ; %.37 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB1_3 Depth 2 ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 -; CHECK-NEXT: s_lshl_b32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s53, s4, 1 -; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53 +; CHECK-NEXT: s_mov_b32 s4, s53 +; CHECK-NEXT: s_lshl_b32 s6, s53, 5 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45 +; CHECK-NEXT: s_add_i32 s53, s53, 1 +; CHECK-NEXT: s_add_i32 s5, s4, 5 +; CHECK-NEXT: v_or3_b32 v56, s6, v42, s53 +; CHECK-NEXT: v_mov_b32_e32 v57, s53 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s53 -; CHECK-NEXT: s_mov_b32 s5, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 
+; CHECK-NEXT: ds_read_u8 v47, v0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 ; CHECK-NEXT: ; %bb.2: ; %.53.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB1_3: ; %.53 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: s_add_i32 s7, s7, 4 +; CHECK-NEXT: s_add_i32 s7, s5, 4 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 -; CHECK-NEXT: s_add_i32 s8, s4, s7 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47 -; CHECK-NEXT: s_add_i32 s9, s8, 5 -; CHECK-NEXT: s_add_i32 s8, s8, 1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41 -; CHECK-NEXT: v_mov_b32_e32 v56, s8 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s7, v41 +; CHECK-NEXT: v_add_nc_u32_e32 v56, 4, v56 +; CHECK-NEXT: v_mov_b32_e32 v57, s5 +; CHECK-NEXT: s_mov_b32 s5, s7 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 -; CHECK-NEXT: ; %bb.4: ; %Flow3 +; CHECK-NEXT: ; %bb.4: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: v_mov_b32_e32 v47, v0 -; CHECK-NEXT: .LBB1_5: ; %Flow4 +; CHECK-NEXT: .LBB1_5: ; %Flow5 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_mov_b32 s54, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 +; CHECK-NEXT: v_cmpx_lt_u32_e64 v57, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -922,19 +913,19 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64 +; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 
1, v56 -; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v57, v41 ; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v57 +; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v47, v0 src0_sel:BYTE_0 src1_sel:DWORD ; CHECK-NEXT: s_and_saveexec_b32 s64, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 @@ -955,23 +946,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v47 +; CHECK-NEXT: ds_write_b32 v0, v56 ; CHECK-NEXT: s_branch .LBB1_7 ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 -; CHECK-NEXT: .LBB1_11: ; %Flow2 +; CHECK-NEXT: .LBB1_11: ; %Flow3 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 ; CHECK-NEXT: s_or_b32 s52, s4, s52 -; CHECK-NEXT: s_mov_b32 s4, s53 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; 
%.119 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 06213ef3e06ea..9150bd0dfcd30 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -5396,759 +5396,762 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB5_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: 
flat_load_dwordx4 v[4:7], v[2:3] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[2:3] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:160 ; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, 
null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 -; CHECK-NEXT: .LBB5_3: ; %Flow5 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB5_3: ; %Flow15 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB5_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] 
-; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[2:3] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:208 ; CHECK-NEXT: 
s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_eq_u64 
s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 -; CHECK-NEXT: .LBB5_6: ; %Flow6 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB5_6: ; %Flow16 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, 
off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB5_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; ALIGNED-NEXT: flat_load_dwordx4 
v[8:11], v[2:3] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[84:87], v[2:3] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:208 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: 
s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v16 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; 
ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v21 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v20 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; 
ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 
offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: 
buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:214 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:208 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, 
off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; 
ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:202 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:192 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, 
s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:178 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v38 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:160 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v50, off, 
s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v20, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:138 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:128 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v66 offset:122 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], 
v65 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:114 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:112 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:106 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:96 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v83 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:90 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:80 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:74 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v84 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:64 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v99, 
off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:32 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:253 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:235 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:227 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v49 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:205 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:199 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:189 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:173 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v96 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:163 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 
offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:151 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:149 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:145 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:139 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:129 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:119 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:113 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:111 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:109 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:107 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:105 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:103 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:101 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:89 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 
offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:81 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:73 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:57 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:43 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:47 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:23 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6157,378 +6160,364 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:3 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 -; ALIGNED-NEXT: .LBB5_3: ; %Flow5 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB5_3: ; %Flow15 +; 
ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB5_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x700, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[12:13] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[12:13] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[0:3], 
v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[12:13] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[12:13] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[12:13] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[12:13] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[12:13] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[27:30], v[12:13] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[12:13] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[12:13] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[54:57], v[12:13] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[12:13] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[12:13] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[12:13] offset:208 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffff00, v12 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 
v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v33 offset:254 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v32 offset:250 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v31 offset:246 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v26 offset:242 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: 
buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v53 offset:238 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v52 offset:234 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v51 offset:230 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v50 offset:226 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: 
buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: 
buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v81 offset:222 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v80 offset:218 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v67 offset:214 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[14:15], v66 offset:210 +; ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:208 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v98 offset:206 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v86 offset:198 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 
offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v82 offset:194 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:192 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: 
buffer_load_dword v83, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v99 offset:190 +; ALIGNED-NEXT: flat_store_byte v[14:15], v99 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v96 offset:186 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v83 offset:182 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v68 offset:178 +; ALIGNED-NEXT: flat_store_byte v[14:15], v68 offset:176 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v97 offset:174 +; ALIGNED-NEXT: flat_store_byte v[14:15], v97 offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v84 offset:170 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v69 offset:166 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v54 offset:162 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:160 ; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, 
s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v85 offset:158 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v70 offset:154 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v55 offset:150 +; ALIGNED-NEXT: flat_store_byte v[14:15], v55 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v38 offset:146 +; ALIGNED-NEXT: flat_store_byte v[14:15], v38 offset:144 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v71 offset:142 +; ALIGNED-NEXT: flat_store_byte v[14:15], v71 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v64 offset:138 +; ALIGNED-NEXT: flat_store_byte v[14:15], v64 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v39 offset:134 +; ALIGNED-NEXT: flat_store_byte v[14:15], v39 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v34 offset:130 +; ALIGNED-NEXT: flat_store_byte v[14:15], v34 offset:128 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; 
ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v65 offset:126 +; ALIGNED-NEXT: flat_store_byte v[14:15], v65 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v48 offset:122 +; ALIGNED-NEXT: flat_store_byte v[14:15], v48 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v35 offset:118 +; ALIGNED-NEXT: flat_store_byte v[14:15], v35 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v27 offset:114 +; ALIGNED-NEXT: flat_store_byte v[14:15], v27 offset:112 +; ALIGNED-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v49 offset:110 +; ALIGNED-NEXT: flat_store_byte v[14:15], v49 offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v36 offset:106 +; ALIGNED-NEXT: flat_store_byte v[14:15], v36 offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v28 offset:102 +; ALIGNED-NEXT: flat_store_byte v[14:15], v28 offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v22 offset:98 +; ALIGNED-NEXT: flat_store_byte v[14:15], v22 offset:96 ; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 ; 
ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 ; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v37 offset:94 +; ALIGNED-NEXT: flat_store_byte v[14:15], v37 offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v29 offset:90 +; ALIGNED-NEXT: flat_store_byte v[14:15], v29 offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v23 offset:86 +; ALIGNED-NEXT: flat_store_byte v[14:15], v23 offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[14:15], v19 offset:82 +; ALIGNED-NEXT: flat_store_byte v[14:15], v19 offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v30 offset:78 +; ALIGNED-NEXT: flat_store_byte v[14:15], v30 offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[14:15], v24 offset:74 +; ALIGNED-NEXT: flat_store_byte v[14:15], v24 offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v20 offset:70 +; ALIGNED-NEXT: flat_store_byte v[14:15], v20 offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v17 offset:66 +; ALIGNED-NEXT: flat_store_byte v[14:15], v17 offset:64 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v25 offset:62 +; ALIGNED-NEXT: flat_store_byte v[14:15], v25 offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v21 offset:58 +; ALIGNED-NEXT: flat_store_byte v[14:15], v21 offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v18 offset:54 +; ALIGNED-NEXT: flat_store_byte v[14:15], v18 offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[14:15], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; 
ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:548 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v11 offset:42 +; ALIGNED-NEXT: flat_store_byte v[14:15], v11 offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v10 offset:46 +; ALIGNED-NEXT: flat_store_byte v[14:15], v10 offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v9 offset:34 +; ALIGNED-NEXT: flat_store_byte v[14:15], v9 offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v8 offset:38 +; ALIGNED-NEXT: flat_store_byte v[14:15], v8 offset:36 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 @@ -6538,274 +6527,295 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v7 offset:30 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v6 offset:26 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v5 offset:22 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v4 offset:18 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:16 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[14:15], v102 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 
offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; 
ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[14:15], v102 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v22 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 
-; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[14:15], v99 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[14:15], v68 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: flat_store_byte 
v[14:15], v97 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:151 +; ALIGNED-NEXT: flat_store_byte v[14:15], v55 offset:149 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:147 +; ALIGNED-NEXT: flat_store_byte v[14:15], v38 offset:145 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 offset:143 +; ALIGNED-NEXT: flat_store_byte v[14:15], v71 offset:141 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:139 +; ALIGNED-NEXT: flat_store_byte v[14:15], v64 offset:137 +; 
ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:135 +; ALIGNED-NEXT: flat_store_byte v[14:15], v39 offset:133 +; ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:131 +; ALIGNED-NEXT: flat_store_byte v[14:15], v34 offset:129 +; ALIGNED-NEXT: flat_store_byte v[14:15], v102 offset:127 +; ALIGNED-NEXT: flat_store_byte v[14:15], v65 offset:125 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[14:15], v48 offset:121 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:119 +; ALIGNED-NEXT: flat_store_byte v[14:15], v35 offset:117 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:115 +; ALIGNED-NEXT: flat_store_byte v[14:15], v27 offset:113 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:111 +; ALIGNED-NEXT: flat_store_byte v[14:15], v49 offset:109 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:107 +; ALIGNED-NEXT: flat_store_byte v[14:15], v36 offset:105 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:103 +; ALIGNED-NEXT: flat_store_byte v[14:15], v28 offset:101 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:99 +; ALIGNED-NEXT: flat_store_byte v[14:15], v22 offset:97 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:95 +; ALIGNED-NEXT: flat_store_byte v[14:15], v37 offset:93 +; ALIGNED-NEXT: flat_store_byte v[14:15], v99 offset:91 +; ALIGNED-NEXT: flat_store_byte v[14:15], v29 offset:89 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:87 +; ALIGNED-NEXT: flat_store_byte v[14:15], v23 offset:85 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:83 +; ALIGNED-NEXT: flat_store_byte v[14:15], v19 offset:81 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[14:15], v30 offset:77 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:75 +; ALIGNED-NEXT: flat_store_byte v[14:15], v24 offset:73 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v20 offset:69 +; ALIGNED-NEXT: flat_store_byte v[14:15], 
v68 offset:67 +; ALIGNED-NEXT: flat_store_byte v[14:15], v17 offset:65 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:63 +; ALIGNED-NEXT: flat_store_byte v[14:15], v25 offset:61 +; ALIGNED-NEXT: flat_store_byte v[14:15], v97 offset:59 +; ALIGNED-NEXT: flat_store_byte v[14:15], v21 offset:57 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:55 +; ALIGNED-NEXT: flat_store_byte v[14:15], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:51 +; ALIGNED-NEXT: flat_store_byte v[14:15], v16 offset:49 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:43 +; ALIGNED-NEXT: flat_store_byte v[14:15], v11 offset:41 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:47 +; ALIGNED-NEXT: flat_store_byte v[14:15], v10 offset:45 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:35 +; ALIGNED-NEXT: flat_store_byte v[14:15], v9 offset:33 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:39 +; ALIGNED-NEXT: flat_store_byte v[14:15], v8 offset:37 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:31 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:29 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:27 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:25 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:23 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:21 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:19 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v3 offset:14 +; ALIGNED-NEXT: flat_store_byte v[14:15], v3 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v2 offset:10 +; ALIGNED-NEXT: flat_store_byte v[14:15], v2 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v1 offset:6 +; ALIGNED-NEXT: flat_store_byte v[14:15], v1 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v0 offset:2 +; 
ALIGNED-NEXT: flat_store_byte v[14:15], v0 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:15 +; ALIGNED-NEXT: flat_store_byte v[14:15], v3 offset:13 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:11 +; ALIGNED-NEXT: flat_store_byte v[14:15], v2 offset:9 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:7 +; ALIGNED-NEXT: flat_store_byte v[14:15], v1 offset:5 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:3 +; ALIGNED-NEXT: flat_store_byte v[14:15], v0 offset:1 +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0xffffff00, v14 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, -1, v15, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 -; ALIGNED-NEXT: .LBB5_6: ; %Flow6 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: .LBB5_6: ; %Flow16 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x9 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -6817,27 +6827,31 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB5_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] -; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 -; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[4:5] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[16:19], v[4:5] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; 
UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11] ; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[12:15] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB5_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 @@ -6846,44 +6860,47 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: flat_load_dwordx4 v[2:5], v[2:3] offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB5_4: ; %Flow3 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr2 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB5_4: ; %Flow13 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB5_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2032 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 +; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], 
v[2:3] offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] -; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 -; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], v[2:3] +; UNROLL3-NEXT: flat_load_dwordx4 v[10:13], v[2:3] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[14:17], v[2:3] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9] ; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32 +; UNROLL3-NEXT: 
v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB5_6 -; UNROLL3-NEXT: .LBB5_7: ; %Flow4 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB5_7: ; %Flow14 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -6900,756 +6917,759 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 -; CHECK-NEXT: 
global_load_dwordx4 v[80:83], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: 
global_store_dwordx4 v[0:1], v[12:15], off offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[64:67], off offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48 ; CHECK-NEXT: s_waitcnt 
vmcnt(1) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[96:99], off offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 -; CHECK-NEXT: .LBB6_3: ; %Flow7 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB6_3: ; %Flow17 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 -; 
CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: 
global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[12:15], off offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: 
global_store_dwordx4 v[0:1], v[64:67], off offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[96:99], off offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB6_5 -; CHECK-NEXT: .LBB6_6: ; %Flow8 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB6_6: ; %Flow18 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p1_p1_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[20:21], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[20:21], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[20:21], off offset:80 -; 
ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:208 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v19, off offset:254 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v18, off offset:250 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:246 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v19, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v17, off offset:246 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v16, off offset:242 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v23, off offset:238 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v22, off offset:234 +; 
ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v21, off offset:230 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v20, off offset:226 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v21 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v27, off 
offset:222 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v26, off offset:218 +; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v25, off offset:214 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v26, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v24, off offset:210 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; 
ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v31, off offset:206 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v30, off offset:202 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v29, off offset:198 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v28, off offset:194 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v35, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v35, off offset:190 +; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v34, off offset:186 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v33, off offset:182 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v32, off offset:178 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, 
off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v39, off offset:174 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v38, off offset:170 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v37, off offset:166 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v36, off offset:162 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off 
offset:160 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v51, off offset:158 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:156 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v50, off offset:154 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v49, off offset:150 +; ALIGNED-NEXT: global_store_byte v[0:1], v49, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v48, off offset:146 +; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: 
buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v55, off offset:142 +; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v54, off offset:138 +; ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v53, off offset:134 +; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v52, off offset:130 +; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v67, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v67, off offset:126 +; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v66, off offset:122 +; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v65, off offset:118 +; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v64, off offset:114 +; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword 
v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v71, off offset:110 +; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v70, off offset:106 +; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v69, off offset:102 +; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v68, off offset:98 +; ALIGNED-NEXT: global_store_byte v[0:1], v68, 
off offset:96 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v83, off offset:94 +; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:92 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v82, off offset:90 +; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v81, off offset:86 +; ALIGNED-NEXT: global_store_byte v[0:1], v81, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v80, off offset:82 +; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:68 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v87, off offset:78 +; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v86, off offset:74 +; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v85, off offset:70 +; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v84, off offset:66 +; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v101, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v99, off offset:62 +; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v98, off offset:58 +; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v97, off offset:54 +; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v96, off offset:50 +; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; 
ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v15, off offset:42 +; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v14, off offset:46 +; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:44 
+; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v13, off offset:34 +; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v12, off offset:38 +; ALIGNED-NEXT: global_store_byte v[0:1], v12, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; 
ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v11, off offset:30 +; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v10, off offset:26 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v9, off offset:22 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v8, off offset:18 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v20, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:245 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:253 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: global_store_byte v[0:1], 
v16, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:235 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:227 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v26, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v55 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:197 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:205 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v68 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:181 +; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:189 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:173 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:159 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:163 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:161 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v36, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:115 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:89 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:19 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:151 +; ALIGNED-NEXT: global_store_byte v[0:1], v49, off offset:149 +; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:147 
+; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:145 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:143 +; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:141 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:139 +; ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:137 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:135 +; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:133 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:131 +; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:129 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:127 +; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:125 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:123 +; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:121 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:119 +; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:117 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:115 +; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:113 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:111 +; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:109 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:107 +; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:105 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:103 +; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:101 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:99 +; ALIGNED-NEXT: global_store_byte v[0:1], v68, off offset:97 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:95 +; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:93 +; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:91 +; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:89 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:87 +; ALIGNED-NEXT: global_store_byte v[0:1], v81, off offset:85 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, 
off offset:83 +; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:81 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:77 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:75 +; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:73 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:71 +; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:69 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:67 +; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:65 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:63 +; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:61 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:59 +; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:57 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:55 +; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:53 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:51 +; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:49 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:43 +; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:41 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:47 +; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:45 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:35 +; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:33 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:39 +; ALIGNED-NEXT: global_store_byte v[0:1], v12, off offset:37 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:31 +; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:29 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:27 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:25 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:23 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:21 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off 
offset:19 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v7, off offset:14 +; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v6, off offset:10 +; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v4, off offset:2 +; ALIGNED-NEXT: global_store_byte v[0:1], v4, off ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -7658,653 +7678,660 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:15 +; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:13 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:11 +; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:9 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:7 +; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[0:1], 
v11, off offset:3 +; ALIGNED-NEXT: global_store_byte v[0:1], v4, off offset:1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_2 -; ALIGNED-NEXT: .LBB6_3: ; %Flow7 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB6_3: ; %Flow17 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x700, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; 
ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[12:13], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[12:13], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[0:3], v[12:13], off +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[12:13], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[12:13], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[12:13], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[12:13], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[12:13], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[12:13], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[12:13], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[54:57], v[12:13], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[12:13], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[12:13], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[12:13], off offset:208 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffff00, v12 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v33, off offset:254 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v32, off offset:250 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v27, off offset:246 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v26, off offset:242 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v53, off offset:238 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v52, off offset:234 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v51, off offset:230 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v50, off offset:226 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v66, 
off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v81, off offset:222 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v80, off offset:218 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v67, off offset:214 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v66, off offset:210 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:320 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v98, off offset:206 +; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v87, off offset:202 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v86, off offset:198 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v82, off offset:194 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, 
s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v99, off offset:190 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v96, off offset:186 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v83, off offset:182 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v68, off offset:178 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; 
ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v97, off offset:174 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v84, off offset:170 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v69, off offset:166 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v54, off offset:162 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi 
v[14:15], v85, off offset:158 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v70, off offset:154 +; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v55, off offset:150 +; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v38, off offset:146 +; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v71, off offset:142 +; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v64, off offset:138 +; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v39, off offset:134 +; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v34, off offset:130 +; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v29, off, 
s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v65, off offset:126 +; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v48, off offset:122 +; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v35, off offset:118 +; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v28, off offset:114 +; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:480 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v49, off offset:110 +; ALIGNED-NEXT: global_store_byte v[14:15], v49, off offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v36, off offset:106 +; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v29, off offset:102 +; ALIGNED-NEXT: global_store_byte v[14:15], v29, off offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v22, off offset:98 +; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: 
buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v37, off offset:94 +; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v30, off offset:90 +; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v23, off offset:86 +; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v19, off offset:82 +; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: 
buffer_store_dword v113, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v31, off offset:78 +; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v24, off offset:74 +; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v20, off 
offset:70 +; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v17, off offset:66 +; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off 
offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v25, off offset:62 +; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v21, off offset:58 +; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v18, off offset:54 +; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v16, off offset:50 +; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:548 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v11, off offset:42 +; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v10, off offset:46 +; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v9, off offset:34 +; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v8, off offset:38 +; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v7, 
off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v7, off offset:30 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v6, off offset:26 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v5, off offset:22 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: 
global_store_byte_d16_hi v[14:15], v4, off offset:18 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; 
ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: 
buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi 
v[16:17], v8, off offset:18 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte 
v[14:15], v101, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v50, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71 +; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:199 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off 
offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:151 +; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:149 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:147 +; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:145 +; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:143 +; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:141 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:139 +; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:137 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:135 +; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:133 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:131 +; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:129 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:127 +; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:125 +; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:123 +; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:121 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:119 +; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:117 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:115 +; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:113 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:111 +; ALIGNED-NEXT: global_store_byte 
v[14:15], v49, off offset:109 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:107 +; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:105 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:103 +; ALIGNED-NEXT: global_store_byte v[14:15], v29, off offset:101 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:99 +; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:97 +; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:95 +; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:93 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:91 +; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:89 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:87 +; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:85 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:83 +; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:81 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:77 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:75 +; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:73 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:71 +; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:69 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:67 +; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:65 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:63 +; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:61 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:59 +; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:57 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:55 +; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:53 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:51 +; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:49 +; ALIGNED-NEXT: global_store_byte 
v[14:15], v115, off offset:43 +; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:41 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:47 +; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:45 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:35 +; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:33 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:39 +; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:37 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:31 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:29 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:27 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:25 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:23 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:21 +; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:19 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v3, off offset:14 +; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v2, off offset:10 +; ALIGNED-NEXT: global_store_byte v[14:15], v2, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v1, off offset:6 +; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v0, off offset:2 +; ALIGNED-NEXT: global_store_byte v[14:15], v0, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v0, 8, v0 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:15 +; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:13 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:11 +; ALIGNED-NEXT: global_store_byte v[14:15], v2, off offset:9 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:7 +; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:5 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:3 +; ALIGNED-NEXT: global_store_byte v[14:15], v0, off offset:1 +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0xffffff00, v14 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, -1, v15, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 -; ALIGNED-NEXT: .LBB6_6: ; %Flow8 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: .LBB6_6: ; %Flow18 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x9 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8316,27 +8343,31 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB6_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[8:11], off ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], 
v[12:15], off offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[16:19], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB6_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 @@ -8345,44 +8376,47 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB6_4: ; %Flow5 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr2 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB6_4: ; %Flow15 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB6_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2032 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo ; UNROLL3-NEXT: s_waitcnt 
vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[6:9], off ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[14:17], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_6 -; UNROLL3-NEXT: .LBB6_7: ; %Flow6 -; 
UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB6_7: ; %Flow16 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false) @@ -8398,1137 +8432,1144 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB7_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[84:87], 
v[96:97], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; 
CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[96:99] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 -; CHECK-NEXT: .LBB7_3: ; %Flow6 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB7_3: ; %Flow16 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB7_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 -; CHECK-NEXT: 
global_load_dwordx4 v[80:83], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:2000 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1984 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1968 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1952 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1936 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1920 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1904 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1888 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1872 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1856 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:1840 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:1824 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:1808 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:1792 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: 
flat_store_dwordx4 v[0:1], v[12:15] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB7_5 -; CHECK-NEXT: .LBB7_6: ; %Flow7 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB7_6: ; %Flow17 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p0_p4_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB7_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 
v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, 
vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:254 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 
offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:240 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:251 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:249 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:255 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:247 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; 
ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v83 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:238 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:235 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:215 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], 
v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:203 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 
offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:187 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v34 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v97 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 
offset:196 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:153 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:157 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:151 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:149 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 
offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:139 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:114 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_store_dword 
v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 
offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: 
buffer_store_dword v7, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 
offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:89 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:87 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:81 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:80 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:73 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:71 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:67 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:51 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:43 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:47 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 
offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 -; ALIGNED-NEXT: .LBB7_3: ; %Flow6 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB7_3: ; %Flow16 +; ALIGNED-NEXT: 
s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB7_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: global_load_dwordx4 
v[96:99], v[2:3], off offset:2032 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:2016 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:2000 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:1984 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1968 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1952 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1936 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1920 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1904 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1888 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1872 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1856 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1840 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:1824 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:1808 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:1792 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 -; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 -; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 
-; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:247 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v83 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:235 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 
8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:215 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v55 +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v87 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v67, 
off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], 
s32 offset:384 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v32 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 
offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 
offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], 
v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 
offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:153 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:157 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:151 +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v98 offset:149 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:139 +; ALIGNED-NEXT: flat_store_byte v[0:1], v116 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:116 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:119 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v40 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v42 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:99 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:89 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[0:1], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:73 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v32 offset:43 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:47 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:36 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 -; ALIGNED-NEXT: .LBB7_6: ; %Flow7 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: .LBB7_6: ; %Flow17 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; UNROLL3-LABEL: memmove_p0_p4_sz2048: @@ -9539,27 +9580,31 @@ 
define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB7_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off +; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[12:15] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] 
offset:32 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB7_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: s_clause 0x1 @@ -9570,44 +9615,45 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB7_4: ; %Flow4 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB7_4: ; %Flow14 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB7_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: s_clause 0x1 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2032 +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; 
UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:1984 +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:1968 +; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:2000 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB7_6 -; UNROLL3-NEXT: .LBB7_7: ; %Flow5 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB7_7: ; %Flow15 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -9829,207 +9875,205 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; 
CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 -; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; CHECK-NEXT: s_movk_i32 s4, 0xf800 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 
offen offset:160 -; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen 
offset:44 -; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2032 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2028 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:2024 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:2020 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:2016 +; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:2012 +; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:2008 +; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:2004 +; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:2000 +; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:1996 +; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:1992 +; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:1988 +; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:1984 +; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:1980 +; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:1976 
+; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:1972 +; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:1968 +; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:1964 +; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:1960 +; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:1956 +; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:1952 +; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:1948 +; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:1944 +; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:1940 +; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:1936 +; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:1932 +; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:1928 +; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:1924 +; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:1920 +; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:1916 +; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:1912 +; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:1908 +; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:1904 +; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:1900 +; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:1896 +; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:1892 +; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:1888 +; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:1884 +; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:1880 +; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:1876 +; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:1872 +; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:1868 +; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:1864 +; CHECK-NEXT: buffer_load_dword v64, 
v1, s[0:3], 0 offen offset:1860 +; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:1856 +; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:1852 +; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:1848 +; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:1844 +; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:1840 +; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:1836 +; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:1832 +; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:1828 +; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:1824 +; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:1820 +; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:1816 +; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:1812 +; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:1808 +; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:1804 +; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:1800 +; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:1796 +; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen offset:1792 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(62) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040 ; CHECK-NEXT: s_waitcnt vmcnt(61) -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036 ; CHECK-NEXT: s_waitcnt vmcnt(60) -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen 
offset:2032 ; CHECK-NEXT: s_waitcnt vmcnt(59) -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2028 ; CHECK-NEXT: s_waitcnt vmcnt(58) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:2024 ; CHECK-NEXT: s_waitcnt vmcnt(57) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:2020 ; CHECK-NEXT: s_waitcnt vmcnt(56) -; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:2016 ; CHECK-NEXT: s_waitcnt vmcnt(55) -; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:2012 ; CHECK-NEXT: s_waitcnt vmcnt(54) -; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:2008 ; CHECK-NEXT: s_waitcnt vmcnt(53) -; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:2004 ; CHECK-NEXT: s_waitcnt vmcnt(52) -; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:2000 ; CHECK-NEXT: s_waitcnt vmcnt(51) -; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:1996 ; CHECK-NEXT: s_waitcnt vmcnt(50) -; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:1992 ; CHECK-NEXT: s_waitcnt vmcnt(49) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:1988 ; CHECK-NEXT: s_waitcnt vmcnt(48) -; CHECK-NEXT: buffer_store_dword 
v17, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:1984 ; CHECK-NEXT: s_waitcnt vmcnt(47) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:1980 ; CHECK-NEXT: s_waitcnt vmcnt(46) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:1976 ; CHECK-NEXT: s_waitcnt vmcnt(45) -; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:1972 ; CHECK-NEXT: s_waitcnt vmcnt(44) -; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:1968 ; CHECK-NEXT: s_waitcnt vmcnt(43) -; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:1964 ; CHECK-NEXT: s_waitcnt vmcnt(42) -; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:1960 ; CHECK-NEXT: s_waitcnt vmcnt(41) -; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:1956 ; CHECK-NEXT: s_waitcnt vmcnt(40) -; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:1952 ; CHECK-NEXT: s_waitcnt vmcnt(39) -; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:1948 ; CHECK-NEXT: s_waitcnt vmcnt(38) -; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:1944 ; CHECK-NEXT: s_waitcnt vmcnt(37) -; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_store_dword v28, 
v0, s[0:3], 0 offen offset:1940 ; CHECK-NEXT: s_waitcnt vmcnt(36) -; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:1936 ; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:1932 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:1928 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:1924 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:1920 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:1916 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:1912 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:1908 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:1904 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:1900 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:1896 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; 
CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:1892 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:1888 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:1884 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:1880 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:1876 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:1872 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:1868 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:1864 ; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:1860 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:1856 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:1852 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: 
buffer_store_dword v67, v0, s[0:3], 0 offen offset:1848 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:1844 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:1840 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:1836 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:1832 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:1828 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:1824 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:1820 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:1816 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:1812 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:1808 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:1804 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; 
CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:1800 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:1796 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen offset:1792 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB8_5 @@ -11145,1055 +11189,1053 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB8_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 -; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 ; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2047 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2046 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2045 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 
4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2044 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2043 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2042 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2041 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2040 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2039 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2038 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:2037 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2036 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2035 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2034 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2033 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2032 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2031 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2030 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2029 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2028 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2027 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2026 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2025 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2024 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2023 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2022 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2021 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2020 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2019 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2018 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2017 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2016 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2015 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2014 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2013 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2012 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2011 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2010 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2009 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2008 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2007 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2006 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:205 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 
offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen 
offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen 
offset:2004 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1994 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:1985 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:1983 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:1979 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:1975 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: 
buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:1963 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:1962 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:1959 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:1957 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:1946 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:1944 
+; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:1942 +; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:1936 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1932 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1931 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:1929 ; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:1928 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, 
s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:1920 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1919 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1918 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: 
buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, 
s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:1916 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:1912 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:1904 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:1900 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: 
buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:1896 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:1889 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:1888 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:1884 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:1880 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:1879 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:1872 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:1871 
+; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:1869 +; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:1866 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1865 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1864 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1863 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1862 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1861 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1860 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 
0 offen offset:1859 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1858 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1857 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1856 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1855 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1854 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1853 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1852 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1851 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1850 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1849 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1848 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1847 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1846 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1845 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1844 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1843 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1842 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1841 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1840 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1839 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1838 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1837 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1836 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1835 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1834 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1833 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1832 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1831 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1830 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1829 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1828 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1827 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1826 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1825 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1824 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1823 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1822 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1821 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1820 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1819 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1818 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1817 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1816 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1815 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1814 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1813 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1812 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1811 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1810 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1809 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1808 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1807 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1806 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1805 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1804 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1803 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1801 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1799 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1798 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1797 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1796 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1795 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1794 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1793 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1792 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2047 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2046 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2045 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2044 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2043 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2042 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2041 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:2040 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2039 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2038 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2037 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2036 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2035 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2034 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2033 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2032 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2031 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2030 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2029 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2028 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2027 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2026 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v2, 
v0, s[0:3], 0 offen offset:2025 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2024 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2023 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2022 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2021 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2020 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:588 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; 
ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; 
ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: 
buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2019 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2018 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2017 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2016 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2015 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2014 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2013 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2012 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2011 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2010 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 
; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2009 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2008 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2007 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2006 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2004 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1994 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:1985 +; 
ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:1983 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:1979 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:1975 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:1963 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:1962 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:1959 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 
offen offset:1957 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:1946 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:1944 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1942 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:1936 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1932 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1931 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_store_byte 
v13, v0, s[0:3], 0 offen offset:1929 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:1928 +; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1920 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1919 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_store_byte v105, v0, 
s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83 
-; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1918 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:1916 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:1912 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:1904 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 
offen offset:1900 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:1896 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:1889 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:1888 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:1884 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:1880 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:1879 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_store_byte 
v42, v0, s[0:3], 0 offen offset:1872 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:1871 +; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:1869 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:1866 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1865 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1864 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1863 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1862 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1861 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68 +; 
ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1860 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1859 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1858 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1857 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1856 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1855 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1854 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1853 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1852 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1851 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1850 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1849 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1848 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1847 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1846 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_byte v2, 
v0, s[0:3], 0 offen offset:1845 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1844 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1843 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1842 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1841 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1840 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1839 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1838 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1837 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1836 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1835 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1834 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1833 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1832 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1831 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1830 ; 
ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1829 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1828 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1827 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1826 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1825 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1824 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1823 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1822 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1821 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1820 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1819 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1818 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1817 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1816 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1815 ; ALIGNED-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1814 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1813 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1812 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1811 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1810 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1809 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1808 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1807 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1806 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1805 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1804 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1803 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1801 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1800 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1799 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1798 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1797 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1796 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1795 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1794 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1793 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen +; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:1792 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 @@ -12355,63 +12397,61 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032 ; UNROLL3-NEXT: s_clause 0x3 -; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2028 -; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024 -; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020 -; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016 -; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1 -; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(3) -; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: .LBB8_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen offset:44 -; 
UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:28 -; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2012 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2008 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2004 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2000 +; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:1996 +; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:1992 +; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:1988 +; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:1984 +; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:1980 +; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:1976 +; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:1972 +; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:1968 ; UNROLL3-NEXT: v_subrev_nc_u32_e32 v1, 48, v1 ; UNROLL3-NEXT: s_add_u32 s4, s4, 48 ; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(11) -; UNROLL3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2012 ; UNROLL3-NEXT: s_waitcnt vmcnt(10) -; UNROLL3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen 
offset:40 +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2008 ; UNROLL3-NEXT: s_waitcnt vmcnt(9) -; UNROLL3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2004 ; UNROLL3-NEXT: s_waitcnt vmcnt(8) -; UNROLL3-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2000 ; UNROLL3-NEXT: s_waitcnt vmcnt(7) -; UNROLL3-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:1996 ; UNROLL3-NEXT: s_waitcnt vmcnt(6) -; UNROLL3-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:1992 ; UNROLL3-NEXT: s_waitcnt vmcnt(5) -; UNROLL3-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:1988 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:1984 ; UNROLL3-NEXT: s_waitcnt vmcnt(3) -; UNROLL3-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:1980 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:1976 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:1972 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:1968 +; UNROLL3-NEXT: v_subrev_nc_u32_e32 v0, 48, v0 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: 
s_cbranch_scc0 .LBB8_6 ; UNROLL3-NEXT: .LBB8_7: ; %Flow17 @@ -12427,13 +12467,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v3 -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB9_2 -; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB9_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 +; CHECK-NEXT: .LBB9_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 @@ -12500,145 +12541,143 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[27:30] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[23:26] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 -; CHECK-NEXT: .LBB9_2: ; %Flow10 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 -; CHECK-NEXT: s_cbranch_execz .LBB9_5 -; CHECK-NEXT: ; %bb.3: ; 
%memmove_bwd_loop.preheader -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x700, v2 -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_2 +; CHECK-NEXT: .LBB9_3: ; %Flow16 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 +; CHECK-NEXT: s_cbranch_execz .LBB9_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: .LBB9_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 
offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 
offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:1824 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:1828 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1832 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1836 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1840 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1844 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1848 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1852 +; CHECK-NEXT: 
buffer_load_dword v14, v2, s[0:3], 0 offen offset:1868 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:1884 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:1880 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:1876 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:1872 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:1864 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:1860 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1856 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:1900 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:1916 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:1912 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:1908 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:1904 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:1896 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:1892 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:1888 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:1964 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:1980 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:1976 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:1972 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:1968 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:1960 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:1956 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:1952 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:2028 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:2040 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:2036 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 
offen offset:2032 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:2024 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:2020 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:2016 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:1996 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:2012 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:2008 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:2004 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:2000 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:1992 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:1988 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:1984 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:1948 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:1944 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:1940 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:1936 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:1932 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:1928 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:1924 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:1920 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:1792 +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:1796 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:1800 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:1808 +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:1812 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:1816 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:1820 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:1804 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 -; CHECK-NEXT: s_add_u32 s4, s4, 
0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[35:38] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[27:30] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:128 +; 
CHECK-NEXT: flat_store_dwordx4 v[0:1], v[23:26] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_scc0 .LBB9_4 -; CHECK-NEXT: .LBB9_5: ; %Flow11 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB9_5 +; CHECK-NEXT: .LBB9_6: ; %Flow17 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -12694,3100 +12733,3109 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo -; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 -; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 -; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 -; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop +; ALIGNED-NEXT: 
s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo +; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v3 +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB9_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 +; ALIGNED-NEXT: .LBB9_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, 
s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:28 +; 
ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, 
s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:70 +; 
ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; 
ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v7 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v21, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(40) -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) -; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v11, v18, 8, v15 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v22, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v25, 8, v24 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v29, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v15, v26, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v16, v30, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v14, 16, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v17, v32, 8, v31 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v38, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: 
v_lshl_or_b32 v5, v37, 8, v36 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(17) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: v_lshl_or_b32 v7, v49, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v8, v52, 8, v51 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v9, v53, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 16, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, 
v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 16, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v67 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v68, 8, v66 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v70, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v71 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword 
v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill 
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:984 ; 4-byte 
Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v83, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: 
buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:93 +; 
ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; 
ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 
4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118 -; 
ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v7 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 
offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, 
s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, 
s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword 
v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: 
buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v125, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v126, 8, v124 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v121, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v108, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v107, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v95, 8, v105 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v93, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 
offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v90, 8, v91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v92, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:182 
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 8, v79 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v72, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v75, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v61, 
8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v59, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v56, 8, v57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen 
offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen 
offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, 
s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 
offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen 
offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v118, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v41, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v68, 16, v67 ; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; 
ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v113, 8, v115 ; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 -; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 -; 
ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v112, 8, v114 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v94, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v101, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v103, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v87, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v63, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v84, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v80, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v83, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v69, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v42, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v54, 8, v66 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v82, 8, v64 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v51, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v48, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v99, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v49, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v38, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v34, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v65, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 
v68, v31, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v25, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v20, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v33, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v23, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v19, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v28, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v15, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v14, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v24, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v13, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v18, v68, 16, v67 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v67, v67, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v7, 8, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v6, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v4, 8, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v68, v5, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v5, 8, v9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v68, v7, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_mov_b32_e32 v5, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:251 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:249 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:255 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:248 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:241 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:247 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:244 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:240 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: 
buffer_store_dword v65, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v67 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:233 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:239 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:237 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:232 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:227 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:225 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:231 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:229 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:228 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:224 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:209 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:211 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:213 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:215 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:212 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:219 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:217 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:223 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:221 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:216 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:208 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: 
s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:202 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:203 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:201 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:207 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:205 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:200 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:195 +; ALIGNED-NEXT: flat_store_byte v[0:1], v43 offset:193 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:199 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v40 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v44 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v45 offset:192 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v127, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v67, 8, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v3, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[0:1], v47 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v58 
offset:187 +; ALIGNED-NEXT: flat_store_byte v[0:1], v56 offset:185 +; ALIGNED-NEXT: flat_store_byte v[0:1], v59 offset:191 +; ALIGNED-NEXT: flat_store_byte v[0:1], v61 offset:189 +; ALIGNED-NEXT: flat_store_byte v[0:1], v60 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v62 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v57 offset:184 +; ALIGNED-NEXT: flat_store_byte v[0:1], v73 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v74 offset:179 +; ALIGNED-NEXT: flat_store_byte v[0:1], v77 offset:177 +; ALIGNED-NEXT: flat_store_byte v[0:1], v72 offset:183 +; ALIGNED-NEXT: flat_store_byte v[0:1], v75 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v76 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v78 offset:180 +; ALIGNED-NEXT: flat_store_byte v[0:1], v79 offset:176 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v90 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[0:1], v88 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v92 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v90 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v93 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v95 offset:173 +; ALIGNED-NEXT: flat_store_byte v[0:1], v104 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v105 offset:172 +; ALIGNED-NEXT: flat_store_byte v[0:1], v91 offset:168 +; ALIGNED-NEXT: flat_store_byte v[0:1], v109 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v108 offset:163 +; ALIGNED-NEXT: flat_store_byte v[0:1], v121 offset:161 +; ALIGNED-NEXT: flat_store_byte v[0:1], v107 offset:167 +; ALIGNED-NEXT: flat_store_byte v[0:1], v111 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v120 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v122 offset:164 +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v123 offset:160 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[0:1], v124 offset:154 +; ALIGNED-NEXT: flat_store_byte v[0:1], v126 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v125 offset:153 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 
offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:159 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:157 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:158 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:156 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:152 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:146 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:147 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:145 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:151 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:149 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:150 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:148 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:144 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:138 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:139 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:137 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:143 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:141 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:142 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:140 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:136 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:130 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:131 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:129 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:135 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:133 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:134 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:132 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:128 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:122 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:123 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:121 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:125 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:126 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:124 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:120 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:114 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:115 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:113 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], 
v0 offset:116 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:119 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:117 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:118 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:116 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:112 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:106 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:107 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:105 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:111 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:109 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:110 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:108 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:104 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:98 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], 
v0 offset:101 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:99 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:97 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:103 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:101 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:102 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:100 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:96 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:90 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:93 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:91 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:89 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:95 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:93 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:94 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:92 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:88 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:82 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:83 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:81 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:87 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:85 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:86 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:84 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:80 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:74 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:75 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:73 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:79 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:77 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 -; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:78 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:76 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:72 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:66 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:67 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:65 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:71 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:69 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:68 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:64 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded 
Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:61 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:58 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:59 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:57 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:63 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:62 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:60 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:56 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:53 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:50 +; 
ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:51 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:49 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:55 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:54 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:52 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:48 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:43 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:42 +; 
ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:41 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:40 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:47 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:46 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:45 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:44 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:35 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:34 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:33 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:32 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:39 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:38 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:37 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:36 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded 
Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:26 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:27 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:25 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:31 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:29 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:30 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 -; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:28 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:18 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:17 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:23 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:21 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:22 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:16 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:11 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:13 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:9 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:8 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:2 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:1 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:7 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:5 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:6 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 -; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 -; ALIGNED-NEXT: .LBB9_2: ; %Flow10 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 -; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 -; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 -; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:4 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; 
ALIGNED-NEXT: s_cbranch_scc1 .LBB9_2 +; ALIGNED-NEXT: .LBB9_3: ; %Flow16 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 +; ALIGNED-NEXT: s_cbranch_execz .LBB9_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 +; ALIGNED-NEXT: .LBB9_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; 
ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, 
v4, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1812 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1813 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1814 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1815 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1816 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:1817 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:1818 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:1811 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1820 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1821 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1822 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:1823 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1824 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1825 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:1826 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:1819 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:1827 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:1828 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:1829 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, 
s[0:3], 0 offen offset:1830 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:1831 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:1832 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:1833 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:1834 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:1835 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:1836 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:1837 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:1838 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:1839 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1840 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1841 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:1842 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1843 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:1844 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:1845 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:1846 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:1847 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1848 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:1849 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:1850 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:1852 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:1853 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:1854 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:1855 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:1856 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:1857 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:1858 +; ALIGNED-NEXT: 
buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:1851 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:1859 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:1860 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:1861 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:1862 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:1863 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:1869 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:1871 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt 
vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v7 ; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 ; ALIGNED-NEXT: s_waitcnt vmcnt(40) -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v21, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(39) -; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v18, 8, v15 ; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v22, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v26, 8, v24 ; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v29, 8, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v15, v25, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v16, v30, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: 
v_lshl_or_b32 v5, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v14, 16, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v17, v33, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v38, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v53, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v8, v52, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v9, v55, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 16, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 16, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v31 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v66, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v70, 8, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v71 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1866 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], 
s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, 
s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:756 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1865 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1864 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:876 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; 
ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1879 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, 
s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v83, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1872 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, 
v4, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1884 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1880 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1889 +; 
ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1888 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1900 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1896 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1904 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen 
offset:1918 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1919 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1916 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1912 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1920 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1248 
; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1942 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1931 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1932 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, 
s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1929 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1928 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, 
v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1936 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:1944 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:1946 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v124, 8, v127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 
0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:1957 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:1959 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v108, 8, v110 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v104, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v93, 8, v106 +; ALIGNED-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v107, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1963 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v90, 8, v91 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:1962 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:1975 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v59, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 8, v61 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v62, 8, v72 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1983 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:1979 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v46, 8, v56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v45, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v42, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v43, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1f +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:1985 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:1994 +; 
ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:2004 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:2006 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:2007 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:2008 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:2009 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:2010 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:2011 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:2012 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:2013 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:2014 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:2015 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v35, v117, 8, v119 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v113, 8, v112 +; ALIGNED-NEXT: s_waitcnt vmcnt(20) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v98, 8, v100 +; ALIGNED-NEXT: v_lshl_or_b32 v105, v36, 16, v35 +; ALIGNED-NEXT: v_lshl_or_b32 v35, v114, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v103, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v92, v36, 16, v35 +; ALIGNED-NEXT: s_clause 0x1f +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:2016 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:2017 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:2018 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:2019 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:2020 
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:2021 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:2022 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2023 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2028 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:2029 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:2030 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:2031 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:2027 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:2024 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:2025 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:2026 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:2032 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:2033 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2034 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:2035 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:2036 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:2037 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2038 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:2039 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:2044 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:2045 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2046 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:2047 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:2043 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:2040 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2041 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:2042 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v99, 8, v101 +; 
ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1792 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1794 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:1796 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:1797 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:1798 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:1799 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v87, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v97, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v75, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v81, 8, v84 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v80, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v70, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v40, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v69, 8, v71 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v55, 8, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v49, 8, v54 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v68, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v102, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v38, 8, v48 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v34, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v85, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v35, 8, v39 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v32, 8, v37 +; 
ALIGNED-NEXT: v_lshl_or_b32 v64, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v29, 8, v31 +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v28, 8, v30 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v53, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v24, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v25, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v51, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v13, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v22, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v18, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v7, 8, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v50, 16, v36 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1793 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1795 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v36, v36, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v50, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v111, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v123, 8, v120 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:1806 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1805 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1804 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 
offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen 
offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, 
s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: v_lshl_or_b32 
v116, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 -; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 
offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:1807 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1803 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v36, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v50, v126, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, 
s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:1800 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:1801 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v120, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v50, v121, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, 
null, s5, v3, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 -; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[2:3], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 -; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 -; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 -; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1810 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:1808 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1809 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:251 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:249 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:255 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:248 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v15 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:241 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:247 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:244 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:240 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:233 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:239 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:237 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:232 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:225 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:231 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:229 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:228 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:224 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v40, 
off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:209 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:211 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:213 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:215 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:212 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:217 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:223 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:221 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:208 +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 
offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 
4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:202 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:203 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:201 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:207 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:205 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:200 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:195 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:199 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:192 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v125, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v125, v50, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v3, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v43 offset:187 +; ALIGNED-NEXT: flat_store_byte v[0:1], v42 offset:185 +; ALIGNED-NEXT: flat_store_byte v[0:1], v45 offset:191 +; ALIGNED-NEXT: flat_store_byte v[0:1], v46 offset:189 +; ALIGNED-NEXT: flat_store_byte v[0:1], v47 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v56 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v44 offset:184 +; ALIGNED-NEXT: flat_store_byte v[0:1], v60 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v59 offset:179 +; ALIGNED-NEXT: flat_store_byte v[0:1], v63 offset:177 +; ALIGNED-NEXT: flat_store_byte v[0:1], v58 offset:183 +; ALIGNED-NEXT: flat_store_byte v[0:1], v62 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v61 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v72 offset:180 +; ALIGNED-NEXT: flat_store_byte v[0:1], v73 offset:176 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 +; 
ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: flat_store_byte v[0:1], v74 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v77 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v76 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v79 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v90 offset:173 +; ALIGNED-NEXT: flat_store_byte v[0:1], v89 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v91 offset:172 +; ALIGNED-NEXT: flat_store_byte v[0:1], v78 offset:168 +; ALIGNED-NEXT: flat_store_byte v[0:1], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[0:1], v108 offset:161 +; ALIGNED-NEXT: flat_store_byte v[0:1], v93 offset:167 +; ALIGNED-NEXT: flat_store_byte v[0:1], v107 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v106 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v109 offset:164 +; ALIGNED-NEXT: flat_store_byte v[0:1], v110 offset:160 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v122 offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v124 offset:153 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:159 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:157 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:158 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v127 offset:152 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:146 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:147 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:145 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:151 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:149 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:150 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:148 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:144 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:138 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:139 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:137 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:143 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:141 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:142 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:140 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:140 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:136 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:130 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:131 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:129 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:135 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:133 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:134 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:132 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:128 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:612 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:122 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:123 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:121 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:125 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:126 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:124 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:120 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:114 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:115 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:113 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:119 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:117 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:118 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:116 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:112 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 -; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:632 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:636 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:106 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:107 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:105 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:105 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:111 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:109 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:110 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:108 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:104 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:98 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:99 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:97 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:103 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:101 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:102 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 -; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:100 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:96 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:90 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:91 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:89 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:95 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:93 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:94 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:92 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:88 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:82 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:83 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:81 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:87 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:85 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:86 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:86 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:84 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:80 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 -; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:74 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:75 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:73 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:79 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:77 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:78 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:76 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:72 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:66 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:67 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:65 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:71 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:69 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:70 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:68 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:64 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:680 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:672 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:61 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:58 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:59 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:57 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:63 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:62 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:60 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:56 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:53 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:50 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:51 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:49 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:55 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:54 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:52 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:52 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:48 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:700 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:696 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:692 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:688 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:43 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:42 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:41 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:40 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:47 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:46 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:45 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:44 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:35 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:34 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:33 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:32 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:39 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:38 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:37 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:36 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:652 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:26 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:27 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:25 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:31 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:29 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:30 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:28 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:18 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:17 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:23 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:21 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:22 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v94 offset:16 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 -; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: flat_store_byte v[0:1], v111 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v121 offset:11 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v120 offset:9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v126 offset:15 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:14 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v123 offset:8 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:2 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:1 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:7 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:5 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:6 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:4 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 -; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 -; ALIGNED-NEXT: .LBB9_5: ; %Flow11 -; ALIGNED-NEXT: s_or_b32 exec_lo, 
exec_lo, s8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_5 +; ALIGNED-NEXT: .LBB9_6: ; %Flow17 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -15844,42 +15892,44 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3: ; %bb.0: ; %entry ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: s_mov_b32 s6, exec_lo +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo ; UNROLL3-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; UNROLL3-NEXT: v_cmpx_ge_u32_e64 v2, v3 -; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s6 +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB9_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v2 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen 
offset:28 -; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44 -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: buffer_load_dword v6, v5, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v7, v5, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v8, v5, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v9, v5, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v10, v5, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v11, v5, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v12, v5, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v13, v5, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v14, v5, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v15, v5, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v16, v5, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v17, v5, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: v_add_nc_u32_e32 v5, 48, v5 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[10:13] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[6:9] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[14:17] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, v3, 48 
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB9_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: s_inst_prefetch 0x2 @@ -15898,9 +15948,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: ; implicit-def: $vgpr2 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB9_4: ; %Flow8 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB9_4: ; %Flow14 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB9_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: s_clause 0x3 @@ -15908,51 +15958,51 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036 ; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040 ; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 ; UNROLL3-NEXT: s_clause 0x3 -; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 -; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 -; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024 -; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028 -; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v2 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen 
offset:2024 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[5:8] offset:2016 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 -; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44 -; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v16, null, s5, v1, vcc_lo +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1968 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1972 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1976 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1980 +; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1984 +; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1988 +; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1992 +; UNROLL3-NEXT: 
buffer_load_dword v12, v2, s[0:3], 0 offen offset:1996 +; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:2000 +; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:2004 +; UNROLL3-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:2008 +; UNROLL3-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:2012 ; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16 -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[9:12] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[5:8] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[11:14] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[13:16] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0xffffffd0, v3 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v4, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_6 -; UNROLL3-NEXT: .LBB9_7: ; %Flow9 +; UNROLL3-NEXT: .LBB9_7: ; %Flow15 ; UNROLL3-NEXT: s_inst_prefetch 0x2 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll index 61c1fd6fbb198..0a7102a14a83a 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll @@ -12,22 +12,20 @@ define amdgpu_kernel void @scaledregtest() local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: loopexit: -; CHECK-NEXT: [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: br label [[FOR_BODY_1:%.*]] ; CHECK: for.body.1: -; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ] +; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA:%.*]], [[LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA:%.*]], [[LOOPEXIT]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8 ; CHECK-NEXT: store ptr [[TMP0]], ptr [[LSR_IV1]], align 8 ; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8 ; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8 ; CHECK-NEXT: br label [[FOR_BODY_1]] ; CHECK: for.body: -; CHECK-NEXT: [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ] -; CHECK-NEXT: [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64 -; CHECK-NEXT: [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64 +; CHECK-NEXT: [[SCEVGEP11_LCSSA]] = phi ptr addrspace(5) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ inttoptr (i32 64 to ptr addrspace(5)), [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCEVGEP13_LCSSA]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ inttoptr (i64 64 to ptr), [[ENTRY]] ] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[SCEVGEP13_LCSSA]], i64 64 +; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr addrspace(5) [[SCEVGEP11_LCSSA]], i32 64 ; CHECK-NEXT: br i1 false, label [[LOOPEXIT]], 
label [[FOR_BODY]] ; entry: @@ -58,7 +56,7 @@ for.body: define protected amdgpu_kernel void @baseregtest(i32 %n, i32 %lda, i1 %arg) local_unnamed_addr { ; CHECK-LABEL: @baseregtest( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[EXIT:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @foo() ; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3 diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll index 78c2d99e830fa..2bd4d42b8ac02 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 { ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP:%.*]] = phi ptr addrspace(3) [ undef, %[[BB]] ], [ [[TMP18:%.*]], %[[BB17:.*]] ] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 8 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 0, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(3) [[SCEVGEP1]], align 8 ; CHECK-NEXT: br label %[[BB4:.*]] ; CHECK: [[BB4]]: @@ -26,14 +26,14 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 0, [[TMP10]] ; CHECK-NEXT: br i1 [[TMP11]], label %[[BB12:.*]], label %[[BB17]] ; CHECK: [[BB12]]: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 0, i32 2 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[SCEVGEP]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 0, [[TMP14]] ; CHECK-NEXT: br i1 
[[TMP15]], label %[[BB16:.*]], label %[[BB17]] ; CHECK: [[BB16]]: ; CHECK-NEXT: unreachable ; CHECK: [[BB17]]: -; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 2 +; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 2 ; CHECK-NEXT: br label %[[BB1]] ; bb: