From cf70c360bf25e801156cf35bb748b4073099aa7a Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Mon, 6 Oct 2025 08:22:11 -0500 Subject: [PATCH 1/6] [AMDGPU] TTI: Prioritize per-iter base adds in LSR plan comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LSR on AMDGPU was selecting plans that reduce Setup/AddRec cost while increasing per-iteration base additions (`NumBaseAdds`) in the loop body. With minimal addressing modes, each extra `add i32` becomes a separate VALU op and hurts throughput (observed on gfx942). Teach GCNTTIImpl to compare LSR plans with `NumBaseAdds` and then `Insns` as dominant criteria, ahead of preheader costs (`SetupCost`, `AddRecCost`). Also return false from isNumRegsMajorCostOfLSR() and true from shouldDropLSRSolutionIfLessProfitable(). This steers LSR away from “-setup +more loop adds” trades; the reduced repro now keeps baseline and a large kernel regains performance. --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 27 +++++++++++++++++++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 5 ++++ 2 files changed, 32 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 03d16fdd54c42..1c151c010ac92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1574,3 +1574,30 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { } return BaseT::getNumberOfParts(Tp); } + +bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A, + const TTI::LSRCost &B) const { + // For AMDGPU (no powerful addressing modes), per-iter base adds are expensive. + auto key = [](const TTI::LSRCost &C) { + // Lexicographic priority: minimize per-iter adds first. 
+ return std::tuple{ + C.NumBaseAdds, // dominate + C.Insns, // rough per-iter body cost + C.SetupCost, // preheader cost (cheaper for us) + C.AddRecCost, + C.ImmCost, + C.ScaleCost, + C.NumIVMuls, + C.NumRegs + }; + }; + return key(A) < key(B); +} + +bool GCNTTIImpl::isNumRegsMajorCostOfLSR() { + return false; +} + +bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const { + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 20da8344c9d37..d0929011c50c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -302,6 +302,11 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// together under a single i32 value. Otherwise fall back to base /// implementation. unsigned getNumberOfParts(Type *Tp) const override; + + bool isLSRCostLess(const TTI::LSRCost &A, + const TTI::LSRCost &B) const; + bool isNumRegsMajorCostOfLSR(); + bool shouldDropLSRSolutionIfLessProfitable() const; }; } // end namespace llvm From 55bf79116523ba8fe90835400b7b2c21d8818728 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Mon, 6 Oct 2025 11:09:40 -0500 Subject: [PATCH 2/6] some lit fixes - Fixed a few lit-tests by limiting to GFX9 and above - Regenerated no-crash tests --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 39 +++-- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 143 ++++++++---------- .../AMDGPU/preserve-addrspace-assert.ll | 6 +- 3 files changed, 90 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 1c151c010ac92..db999eb1f1ae9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -16,6 +16,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 
#include "SIModeRegisterDefaults.h" #include "llvm/Analysis/InlineCost.h" @@ -1578,20 +1579,30 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const { // For AMDGPU (no powerful addressing modes), per-iter base adds are expensive. - auto key = [](const TTI::LSRCost &C) { - // Lexicographic priority: minimize per-iter adds first. - return std::tuple{ - C.NumBaseAdds, // dominate - C.Insns, // rough per-iter body cost - C.SetupCost, // preheader cost (cheaper for us) - C.AddRecCost, - C.ImmCost, - C.ScaleCost, - C.NumIVMuls, - C.NumRegs - }; - }; - return key(A) < key(B); + const GCNSubtarget &ST = *static_cast(getST()); + + // Limit the aggressive GPU-centric ordering to GFX9+ only. + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS + 1 /* GFX9 */) { + // GFX9+ (gfx90/940/942,...): prioritize per-iter work over preheader. + if (A.NumBaseAdds != B.NumBaseAdds) + return A.NumBaseAdds < B.NumBaseAdds; + if (A.Insns != B.Insns) + return A.Insns < B.Insns; + // Only if per-iter ties, consider preheader-related costs. + if (A.AddRecCost != B.AddRecCost) + return A.AddRecCost < B.AddRecCost; + if (A.SetupCost != B.SetupCost) + return A.SetupCost < B.SetupCost; + // Fall back to minor keys to keep total order stable. + if (A.ScaleCost != B.ScaleCost) + return A.ScaleCost < B.ScaleCost; + if (A.NumIVMuls != B.NumIVMuls) + return A.NumIVMuls < B.NumIVMuls; + return A.NumRegs < B.NumRegs; + } + + // Pre-GFX9: keep the default behavior (don’t perturb bonaire/VI tests). 
+ return BaseT::isLSRCostLess(A, B); } bool GCNTTIImpl::isNumRegsMajorCostOfLSR() { diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index ca4f5d22ca9a0..647e730c8f51f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -7,138 +7,119 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-LABEL: issue63986: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshlrev_b64 v[4:5], 6, v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v6, s17 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s16, v4 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v6, v5, vcc +; CHECK-NEXT: v_lshlrev_b64 v[2:3], 6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v5, s17 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, s16, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB0_1: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v7, s5 -; CHECK-NEXT: v_mov_b32_e32 v6, s4 -; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v8 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 ; CHECK-NEXT: s_add_u32 s4, s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[6:7], s[4:5], 32 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; CHECK-NEXT: flat_store_dwordx4 v[2:3], v[6:9] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %loop-memcpy-residual-header -; CHECK-NEXT: s_branch .LBB0_4 -; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: ; implicit-def: 
$vgpr6_vgpr7 -; CHECK-NEXT: s_branch .LBB0_5 -; CHECK-NEXT: .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge -; CHECK-NEXT: v_lshlrev_b64 v[6:7], 6, v[2:3] -; CHECK-NEXT: s_cbranch_execnz .LBB0_8 -; CHECK-NEXT: .LBB0_5: ; %loop-memcpy-residual.preheader -; CHECK-NEXT: s_add_u32 s4, s16, 32 -; CHECK-NEXT: s_addc_u32 s5, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, s5 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; CHECK-NEXT: s_cbranch_execnz .LBB0_5 +; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual.preheader ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ; %bb.6: ; %loop-memcpy-residual +; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_u32 s6, 32, s4 ; CHECK-NEXT: s_addc_u32 s7, 0, s5 -; CHECK-NEXT: v_mov_b32_e32 v6, s6 -; CHECK-NEXT: v_mov_b32_e32 v7, s7 -; CHECK-NEXT: flat_load_ubyte v10, v[6:7] -; CHECK-NEXT: v_mov_b32_e32 v7, s5 -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: flat_load_ubyte v6, v[2:3] +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s6, v4 ; CHECK-NEXT: s_add_u32 s4, s4, 1 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; CHECK-NEXT: s_addc_u32 s5, 0, s5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[6:7], v10 -; CHECK-NEXT: ; %bb.7: -; CHECK-NEXT: v_mov_b32_e32 v7, v5 -; CHECK-NEXT: v_mov_b32_e32 v6, v4 -; CHECK-NEXT: .LBB0_8: ; %post-loop-memcpy-expansion +; CHECK-NEXT: flat_store_byte v[2:3], v6 +; CHECK-NEXT: .LBB0_5: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_and_b32_e32 v2, 15, v0 -; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v6, v0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v1, vcc +; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 ; CHECK-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1] ; CHECK-NEXT: v_cmp_ne_u64_e64 
s[6:7], 0, v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v6, s17 -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, s16, v4 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc -; CHECK-NEXT: s_branch .LBB0_11 -; CHECK-NEXT: .LBB0_9: ; %Flow14 -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_branch .LBB0_8 +; CHECK-NEXT: .LBB0_6: ; %Flow7 +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: .LBB0_10: ; %Flow16 -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: .LBB0_7: ; %Flow9 +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_19 -; CHECK-NEXT: .LBB0_11: ; %while.cond +; CHECK-NEXT: s_cbranch_vccz .LBB0_16 +; CHECK-NEXT: .LBB0_8: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_13 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_17 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_10 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_14 Depth 2 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_14 -; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-expansion2.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_11 +; CHECK-NEXT: ; %bb.9: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 -; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 +; CHECK-NEXT: .LBB0_10: ; %loop-memcpy-expansion2 +; CHECK-NEXT: ; Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v6, s12 ; CHECK-NEXT: v_mov_b32_e32 v7, s13 -; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v8 +; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v11, s13 +; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, 
s12, v4 ; CHECK-NEXT: s_add_u32 s12, s12, 16 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v5, v11, vcc ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; CHECK-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[6:9] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] -; CHECK-NEXT: s_cbranch_execnz .LBB0_13 -; CHECK-NEXT: .LBB0_14: ; %Flow15 -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_cbranch_execnz .LBB0_10 +; CHECK-NEXT: .LBB0_11: ; %Flow8 +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_cbranch_execz .LBB0_10 -; CHECK-NEXT: ; %bb.15: ; %loop-memcpy-residual-header5 -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_7 +; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_9 -; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_6 +; CHECK-NEXT: ; %bb.13: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 -; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 +; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-residual4 +; CHECK-NEXT: ; Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v10, s15 +; CHECK-NEXT: v_mov_b32_e32 v7, s15 ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v10, vcc -; CHECK-NEXT: 
flat_load_ubyte v11, v[6:7] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v4 +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc +; CHECK-NEXT: flat_load_ubyte v8, v[6:7] ; CHECK-NEXT: s_add_u32 s14, s14, 1 ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[2:3] -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v10, vcc +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v7, vcc ; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[6:7], v11 +; CHECK-NEXT: flat_store_byte v[6:7], v8 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_17 -; CHECK-NEXT: ; %bb.18: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 +; CHECK-NEXT: s_cbranch_execnz .LBB0_14 +; CHECK-NEXT: ; %bb.15: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_branch .LBB0_9 -; CHECK-NEXT: .LBB0_19: ; %DummyReturnBlock +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_16: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll index 78c2d99e830fa..2bd4d42b8ac02 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 { ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP:%.*]] = phi ptr addrspace(3) [ undef, %[[BB]] ], [ [[TMP18:%.*]], %[[BB17:.*]] ] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 8 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr 
addrspace(3) [[TMP]], i64 0, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(3) [[SCEVGEP1]], align 8 ; CHECK-NEXT: br label %[[BB4:.*]] ; CHECK: [[BB4]]: @@ -26,14 +26,14 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 0, [[TMP10]] ; CHECK-NEXT: br i1 [[TMP11]], label %[[BB12:.*]], label %[[BB17]] ; CHECK: [[BB12]]: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 16 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 0, i32 2 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[SCEVGEP]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 0, [[TMP14]] ; CHECK-NEXT: br i1 [[TMP15]], label %[[BB16:.*]], label %[[BB17]] ; CHECK: [[BB16]]: ; CHECK-NEXT: unreachable ; CHECK: [[BB17]]: -; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 2 +; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 2 ; CHECK-NEXT: br label %[[BB1]] ; bb: From 1a3e81f7d12952e335932f049e95ff7849f3c382 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Tue, 7 Oct 2025 10:41:57 -0500 Subject: [PATCH 3/6] =?UTF-8?q?Cost=20model:=20-=20Reorder=20GCNTTIImpl::i?= =?UTF-8?q?sLSRCostLess()=20for=20GFX9+=20to=20prioritize=20per-iteration?= =?UTF-8?q?=20work:=20=20=201)=20Insns=20(lower=20is=20better)=20=20=202)?= =?UTF-8?q?=20NumBaseAdds=20(tie-breaker=20to=20Insns)=20=20=203)=20NumIVM?= =?UTF-8?q?uls=20(penalize=20mul/mul=5Fhi/addc=20IV=20chains)=20=20=204)?= =?UTF-8?q?=20AddRecCost,=20then=20SetupCost=20(preheader=20costs=20only?= =?UTF-8?q?=20if=20per-iter=20ties)=20=20=205)=20Minor=20keys:=20ScaleCost?= =?UTF-8?q?,=20then=20NumRegs=20-=20Keep=20pre-GFX9=20behavior=20unchanged?= =?UTF-8?q?=20(fall=20back=20to=20BaseT::isLSRCostLess).=20-=20Rationale:?= =?UTF-8?q?=20AMDGPU=20lacks=20rich=20addressing=20modes;=20we=20must=20av?= 
=?UTF-8?q?oid=20LSR=20=E2=80=9Cwins=E2=80=9D=20that=20=20=20reduce=20setu?= =?UTF-8?q?p=20but=20add=20per-iteration=20base-adds=20/=20wide=20IV=20mul?= =?UTF-8?q?=20chains.=20This=20ordering=20=20=20consistently=20favors=20pl?= =?UTF-8?q?ans=20with=20less=20per-iteration=20work=20and=20reduces=20VALU?= =?UTF-8?q?=20pressure=20=20=20on=20gfx9+=20(e.g.,=20gfx942).=20In=20pract?= =?UTF-8?q?ice=20LSR=20now=20often=20drops=20its=20=E2=80=9Cimprovement?= =?UTF-8?q?=E2=80=9D=20in=20=20=20favor=20of=20the=20baseline=20plan=20(as?= =?UTF-8?q?=20seen=20in=20-debug-only=3Dlsr=20logs).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests: - Regenerate `llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll` checks (kernel `introduced_copy_to_sgpr`). Assembly changed in a beneficial way: fewer VALU ops in the hot loops and lower register usage: num_vgpr: 28 → 25 numbered_sgpr: 26 → 20 No functional change to the test’s intent (no RA asserts; copy/RA behavior intact). Checks updated via `utils/update_llc_test_checks.py`. --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 35 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 377 +++++++++--------- 2 files changed, 202 insertions(+), 210 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index db999eb1f1ae9..e6fdc212d85a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1578,30 +1578,41 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const { - // For AMDGPU (no powerful addressing modes), per-iter base adds are expensive. const GCNSubtarget &ST = *static_cast(getST()); - // Limit the aggressive GPU-centric ordering to GFX9+ only. 
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS + 1 /* GFX9 */) { - // GFX9+ (gfx90/940/942,...): prioritize per-iter work over preheader. - if (A.NumBaseAdds != B.NumBaseAdds) - return A.NumBaseAdds < B.NumBaseAdds; - if (A.Insns != B.Insns) + // GFX9+: favor lower per-iteration work first; preheader/setup only as tie-breakers. + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS + 1) { + // 1) Total per-iteration instructions. This already includes base-adds, IV muls, etc. + if (A.Insns != B.Insns) { + // dbgs() << "MS: Insns different, A.Insns = " << A.Insns << ", B.Insns == " << B.Insns << "\n"; return A.Insns < B.Insns; - // Only if per-iter ties, consider preheader-related costs. + } + + // 2) Prefer fewer per-iteration base adds as a tie-breaker to Insns. + if (A.NumBaseAdds != B.NumBaseAdds) { + // dbgs() << "MS: NumBaseAdds different, A.NumBaseAdds = " << A.NumBaseAdds << ", B.NumBaseAdds == " << B.NumBaseAdds << "\n"; + return A.NumBaseAdds < B.NumBaseAdds; + } + + // 3) Strongly prefer fewer IV multiplications (mul/mul_hi/addc chains are costly on AMDGPU). + if (A.NumIVMuls != B.NumIVMuls) { + // dbgs() << "MS: NumIVMuls different, A.NumIVMuls = " << A.NumIVMuls << ", B.NumIVMuls == " << B.NumIVMuls << "\n"; + return A.NumIVMuls < B.NumIVMuls; + } + + // 4) Only if per-iteration work ties, consider preheader-related costs. if (A.AddRecCost != B.AddRecCost) return A.AddRecCost < B.AddRecCost; if (A.SetupCost != B.SetupCost) return A.SetupCost < B.SetupCost; - // Fall back to minor keys to keep total order stable. + + // 5) Minor keys to stabilize ordering. if (A.ScaleCost != B.ScaleCost) return A.ScaleCost < B.ScaleCost; - if (A.NumIVMuls != B.NumIVMuls) - return A.NumIVMuls < B.NumIVMuls; return A.NumRegs < B.NumRegs; } - // Pre-GFX9: keep the default behavior (don’t perturb bonaire/VI tests). + // Pre-GFX9: keep the default behavior. 
return BaseT::isLSRCostLess(A, B); } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index a21db73cf3714..ff5391fc1f107 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -510,21 +510,20 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc +; GFX908-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18 -; GFX908-NEXT: s_mov_b32 s12, 0 -; GFX908-NEXT: s_mov_b32 s9, s12 +; GFX908-NEXT: s_mov_b32 s8, 0 +; GFX908-NEXT: v_mov_b32_e32 v16, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX908-NEXT: s_sub_i32 s1, 0, s7 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 -; GFX908-NEXT: v_mov_b32_e32 v17, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_readfirstlane_b32 s2, v0 +; GFX908-NEXT: v_cvt_f32_f16_e32 v14, s0 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX908-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX908-NEXT: v_readfirstlane_b32 s2, v1 ; GFX908-NEXT: s_mul_i32 s1, s1, s2 ; GFX908-NEXT: s_mul_hi_u32 s1, s2, s1 ; GFX908-NEXT: s_add_i32 s2, s2, s1 @@ -533,164 +532,154 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_sub_i32 s2, s6, s2 ; GFX908-NEXT: s_add_i32 s3, s1, 1 ; GFX908-NEXT: s_sub_i32 s6, s2, s7 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; 
GFX908-NEXT: v_readfirstlane_b32 s9, v0 +; GFX908-NEXT: s_and_b32 s18, s9, 0xffff ; GFX908-NEXT: s_cmp_ge_u32 s2, s7 ; GFX908-NEXT: s_cselect_b32 s1, s3, s1 ; GFX908-NEXT: s_cselect_b32 s2, s6, s2 ; GFX908-NEXT: s_add_i32 s3, s1, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s7 -; GFX908-NEXT: s_cselect_b32 s8, s3, s1 -; GFX908-NEXT: s_lshr_b32 s2, s0, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: s_cselect_b32 s19, s3, s1 +; GFX908-NEXT: s_lshr_b32 s0, s0, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v15, s0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s14, s14, 28 -; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s2, v16 -; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX908-NEXT: s_mul_i32 s3, s5, s2 -; GFX908-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX908-NEXT: s_mul_i32 s2, s4, s2 -; GFX908-NEXT: s_add_i32 s3, s5, s3 -; GFX908-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX908-NEXT: s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow20 +; GFX908-NEXT: .LBB3_1: ; %Flow10 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX908-NEXT: s_cbranch_vccz .LBB3_12 +; GFX908-NEXT: s_cbranch_vccz .LBB3_13 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[18:19], -1 +; GFX908-NEXT: s_mov_b64 s[6:7], -1 ; GFX908-NEXT: s_mov_b64 vcc, s[0:1] -; GFX908-NEXT: s_cbranch_vccz .LBB3_10 +; GFX908-NEXT: s_cbranch_vccz .LBB3_11 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: global_load_dwordx2 v[8:9], v[0:1], off ; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 -; GFX908-NEXT: s_mov_b32 s13, s12 -; 
GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v4, s12 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s12 -; GFX908-NEXT: v_mov_b32_e32 v8, s12 -; GFX908-NEXT: v_mov_b32_e32 v5, s13 -; GFX908-NEXT: v_mov_b32_e32 v7, s13 -; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] -; GFX908-NEXT: v_mov_b32_e32 v10, v4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[6:7], s[10:11], 0 +; GFX908-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[2:3] +; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v4 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s9, v2 -; GFX908-NEXT: v_readfirstlane_b32 s13, v3 -; GFX908-NEXT: s_add_u32 s9, s9, 1 -; GFX908-NEXT: s_addc_u32 s13, s13, 0 -; GFX908-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX908-NEXT: s_mul_i32 s13, s6, s13 -; GFX908-NEXT: s_mul_i32 s23, s7, s9 -; GFX908-NEXT: s_add_i32 s13, s22, s13 -; GFX908-NEXT: s_mul_i32 s9, s6, s9 -; GFX908-NEXT: s_add_i32 s13, s13, s23 +; GFX908-NEXT: v_readfirstlane_b32 s12, v8 +; GFX908-NEXT: v_readfirstlane_b32 s13, v9 +; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: s_branch .LBB3_5 -; GFX908-NEXT: .LBB3_4: ; %bb58 +; GFX908-NEXT: .LBB3_4: ; %Flow8 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s4 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s5 -; GFX908-NEXT: s_mov_b64 s[22:23], 0 -; 
GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX908-NEXT: s_cbranch_vccz .LBB3_9 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_cbranch_vccz .LBB3_10 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s9 -; GFX908-NEXT: s_addc_u32 s23, s21, s13 -; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc +; GFX908-NEXT: s_add_u32 s9, s12, 1 +; GFX908-NEXT: s_addc_u32 s14, s13, 0 +; GFX908-NEXT: s_mul_i32 s15, s9, s5 +; GFX908-NEXT: s_mul_hi_u32 s16, s9, s4 +; GFX908-NEXT: s_add_i32 s15, s16, s15 +; GFX908-NEXT: s_mul_i32 s14, s14, s4 +; GFX908-NEXT: s_add_i32 s15, s15, s14 +; GFX908-NEXT: s_mul_i32 s9, s9, s4 +; GFX908-NEXT: s_add_u32 s14, s9, s10 +; GFX908-NEXT: s_addc_u32 s15, s15, s11 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[14:15], 5 +; GFX908-NEXT: global_load_dword v18, v16, s[14:15] offset:16 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc +; GFX908-NEXT: global_load_dword v17, v16, s[14:15] offset:20 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc +; GFX908-NEXT: global_load_dword v10, v16, s[14:15] offset:24 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc +; GFX908-NEXT: global_load_dword v10, v16, s[14:15] offset:28 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[12:13], v17 -; GFX908-NEXT: ds_read_b64 v[14:15], v0 +; GFX908-NEXT: ds_read_b64 v[10:11], v16 +; GFX908-NEXT: ds_read_b64 v[12:13], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX908-NEXT: ; kill: killed $sgpr14_sgpr15 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; 
GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v18, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v19, v13 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 -; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 -; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 -; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 -; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 -; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 -; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 -; GFX908-NEXT: s_branch .LBB3_4 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GFX908-NEXT: v_add_f32_e32 v21, v14, v10 +; GFX908-NEXT: v_add_f32_e32 v22, v15, v11 +; GFX908-NEXT: v_add_f32_e32 v23, 0, v10 +; GFX908-NEXT: v_add_f32_e32 v24, 0, v11 +; GFX908-NEXT: v_add_f32_e32 v13, v19, v13 +; GFX908-NEXT: v_add_f32_e32 v12, v18, v12 +; GFX908-NEXT: v_add_f32_e32 v11, v20, v11 +; GFX908-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v22 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v21 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v24 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v23 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v12 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v13 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v10 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v11 +; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_branch .LBB3_8 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; 
GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] -; GFX908-NEXT: s_cbranch_vccz .LBB3_4 -; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[22:23], -1 -; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard +; GFX908-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX908-NEXT: .LBB3_8: ; %Flow +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX908-NEXT: ; %bb.9: ; %bb58 +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: s_add_u32 s12, s12, s18 +; GFX908-NEXT: s_addc_u32 s13, s13, 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[12:13], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_branch .LBB3_4 +; GFX908-NEXT: .LBB3_10: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow19 +; GFX908-NEXT: s_xor_b64 s[6:7], s[14:15], -1 +; GFX908-NEXT: .LBB3_11: ; %Flow9 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[2:3], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 -; GFX908-NEXT: ; %bb.11: ; %bb12 +; GFX908-NEXT: ; %bb.12: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s10, s10, s8 +; GFX908-NEXT: s_add_u32 s10, s10, s19 ; GFX908-NEXT: s_addc_u32 s11, s11, 0 -; GFX908-NEXT: s_add_u32 s14, s14, s16 -; GFX908-NEXT: s_addc_u32 s15, s15, s17 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 ; GFX908-NEXT: s_branch .LBB3_1 -; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock +; GFX908-NEXT: .LBB3_13: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: 
global_load_ushort v18, v[0:1], off glc +; GFX90A-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10 ; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18 ; GFX90A-NEXT: s_mov_b32 s12, 0 -; GFX90A-NEXT: s_mov_b32 s9, s12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: s_mul_i32 s1, s1, s2 ; GFX90A-NEXT: s_mul_hi_u32 s1, s2, s1 @@ -700,132 +689,124 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_sub_i32 s2, s6, s2 ; GFX90A-NEXT: s_add_i32 s3, s1, 1 ; GFX90A-NEXT: s_sub_i32 s6, s2, s7 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 +; GFX90A-NEXT: s_and_b32 s18, s8, 0xffff ; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 ; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 ; GFX90A-NEXT: s_cselect_b32 s2, s6, s2 ; GFX90A-NEXT: s_add_i32 s3, s1, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s7 -; GFX90A-NEXT: s_cselect_b32 s8, s3, s1 -; GFX90A-NEXT: s_lshr_b32 s2, s0, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX90A-NEXT: s_cselect_b32 s19, s3, s1 +; GFX90A-NEXT: s_lshr_b32 s1, s0, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: 
s_or_b32 s14, s14, 28 -; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 -; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX90A-NEXT: s_mul_i32 s3, s5, s2 -; GFX90A-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX90A-NEXT: s_mul_i32 s2, s4, s2 -; GFX90A-NEXT: s_add_i32 s3, s5, s3 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: .LBB3_1: ; %Flow20 +; GFX90A-NEXT: .LBB3_1: ; %Flow10 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 +; GFX90A-NEXT: s_cbranch_vccz .LBB3_13 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[18:19], -1 +; GFX90A-NEXT: s_mov_b64 s[6:7], -1 ; GFX90A-NEXT: s_mov_b64 vcc, s[0:1] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 +; GFX90A-NEXT: s_cbranch_vccz .LBB3_11 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[2:3], off ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 ; GFX90A-NEXT: s_mov_b32 s13, s12 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[6:7], s[10:11], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[12:13], s[12:13] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] -; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
v_readfirstlane_b32 s9, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s13, v5 -; GFX90A-NEXT: s_add_u32 s9, s9, 1 -; GFX90A-NEXT: s_addc_u32 s13, s13, 0 -; GFX90A-NEXT: s_mul_hi_u32 s22, s6, s9 -; GFX90A-NEXT: s_mul_i32 s13, s6, s13 -; GFX90A-NEXT: s_mul_i32 s23, s7, s9 -; GFX90A-NEXT: s_add_i32 s13, s22, s13 -; GFX90A-NEXT: s_mul_i32 s9, s6, s9 -; GFX90A-NEXT: s_add_i32 s13, s13, s23 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v10 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v11 +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: s_branch .LBB3_5 -; GFX90A-NEXT: .LBB3_4: ; %bb58 +; GFX90A-NEXT: .LBB3_4: ; %Flow8 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s4 -; GFX90A-NEXT: s_addc_u32 s21, s21, s5 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[22:23], 0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s9 -; GFX90A-NEXT: s_addc_u32 s23, s21, s13 -; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s13, s8, 1 +; GFX90A-NEXT: s_addc_u32 s14, s9, 0 +; GFX90A-NEXT: s_mul_i32 s15, s13, s5 +; GFX90A-NEXT: s_mul_hi_u32 s16, s13, s4 +; GFX90A-NEXT: s_add_i32 s15, s16, s15 +; GFX90A-NEXT: s_mul_i32 s14, s14, s4 +; GFX90A-NEXT: s_add_i32 s15, s15, s14 +; GFX90A-NEXT: s_mul_i32 s13, s13, s4 +; GFX90A-NEXT: s_add_u32 s14, s13, s10 +; GFX90A-NEXT: s_addc_u32 s15, s15, s11 +; GFX90A-NEXT: s_lshl_b64 s[14:15], s[14:15], 5 +; GFX90A-NEXT: global_load_dword v18, v16, s[14:15] offset:16 glc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX90A-NEXT: global_load_dword v17, v16, s[14:15] offset:20 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc +; GFX90A-NEXT: global_load_dword v12, v16, s[14:15] offset:24 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc +; GFX90A-NEXT: global_load_dword v12, v16, s[14:15] offset:28 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[14:15], v19 -; GFX90A-NEXT: ds_read_b64 v[16:17], v0 +; GFX90A-NEXT: ds_read_b64 v[12:13], v16 +; GFX90A-NEXT: ds_read_b64 v[14:15], v0 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3] -; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 +; GFX90A-NEXT: ; kill: killed $sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[0:1], v[14:15] -; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] -; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v17 +; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[0:1], v[12:13] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[12:13], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[18:19], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 
v[12:13], v[20:21], v[12:13] +; GFX90A-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[22:23] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] -; GFX90A-NEXT: s_branch .LBB3_4 +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[12:13] +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_branch .LBB3_8 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 -; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[22:23], -1 -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard +; GFX90A-NEXT: s_mov_b64 s[16:17], s[6:7] +; GFX90A-NEXT: .LBB3_8: ; %Flow +; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX90A-NEXT: ; %bb.9: ; %bb58 +; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: s_add_u32 s8, s8, s18 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[8:9], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_branch .LBB3_4 +; GFX90A-NEXT: .LBB3_10: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow19 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[14:15], -1 +; GFX90A-NEXT: .LBB3_11: ; %Flow9 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[2:3], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 -; GFX90A-NEXT: 
; %bb.11: ; %bb12 +; GFX90A-NEXT: ; %bb.12: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s10, s10, s8 +; GFX90A-NEXT: s_add_u32 s10, s10, s19 ; GFX90A-NEXT: s_addc_u32 s11, s11, 0 -; GFX90A-NEXT: s_add_u32 s14, s14, s16 -; GFX90A-NEXT: s_addc_u32 s15, s15, s17 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NEXT: s_branch .LBB3_1 -; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock +; GFX90A-NEXT: .LBB3_13: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm bb: %i = load volatile i16, ptr addrspace(4) poison, align 2 From d997a524970c5b418ec654c1a240e58e03fcb84d Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Thu, 9 Oct 2025 06:26:07 -0500 Subject: [PATCH 4/6] regen test --- .../GlobalISel/divergence-temporal-divergent-reg.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index d4e5487828c48..9a3525edc4b34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -5,14 +5,14 @@ define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -40,14 +40,14 @@ define void @temporal_divergent_i32_multiple_use(float %val, ptr %addr, ptr %add ; GFX10-LABEL: 
temporal_divergent_i32_multiple_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 From b1d0879908dec7fe09725a9160862496ed538ce2 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Thu, 9 Oct 2025 13:13:51 -0500 Subject: [PATCH 5/6] [AMDGPU] TTI: Reorder LSR cost priorities - prefer IV efficiency over base-add count Adjust the LSR cost comparison order for GFX9+ to prioritize per-iteration IV efficiency over minimizing base-adds. Previous order had NumBaseAdds checked early, causing the cost model to prefer baseline solutions with multiple IVs (NumBaseAdds=0) over optimized LSR solutions with fewer IVs but one base-add (NumBaseAdds=1). This resulted in extra register moves between IVs in the loop body, increasing VALU pressure. Changes to priority order for GFX9+: 1. Insns (total per-iteration instructions) 2. NumIVMuls (IV multiplication chains) <- moved above NumBaseAdds 3. AddRecCost (per-iteration IV updates) <- moved above NumBaseAdds 4. NumBaseAdds (base address adds) <- moved down 5. SetupCost (preheader costs) 6. ScaleCost, NumRegs (tie-breakers) Rationale: AddRecCost directly measures per-iteration IV update cost (fewer IVs = lower cost), which better reflects actual loop body work than NumBaseAdds alone. This fixes cases where baseline with 2 IVs generated more VALU instructions than LSR solution with 1 IV + base-add. 
--- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index e6fdc212d85a9..b78e279ebcafe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1584,31 +1584,40 @@ bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A, if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS + 1) { // 1) Total per-iteration instructions. This already includes base-adds, IV muls, etc. if (A.Insns != B.Insns) { - // dbgs() << "MS: Insns different, A.Insns = " << A.Insns << ", B.Insns == " << B.Insns << "\n"; + dbgs() << "MS: Insns different, A.Insns = " << A.Insns << ", B.Insns == " << B.Insns << "\n"; return A.Insns < B.Insns; } - // 2) Prefer fewer per-iteration base adds as a tie-breaker to Insns. - if (A.NumBaseAdds != B.NumBaseAdds) { - // dbgs() << "MS: NumBaseAdds different, A.NumBaseAdds = " << A.NumBaseAdds << ", B.NumBaseAdds == " << B.NumBaseAdds << "\n"; - return A.NumBaseAdds < B.NumBaseAdds; - } - - // 3) Strongly prefer fewer IV multiplications (mul/mul_hi/addc chains are costly on AMDGPU). + // 2) Strongly prefer fewer IV multiplications (mul/mul_hi/addc chains are costly on AMDGPU). if (A.NumIVMuls != B.NumIVMuls) { - // dbgs() << "MS: NumIVMuls different, A.NumIVMuls = " << A.NumIVMuls << ", B.NumIVMuls == " << B.NumIVMuls << "\n"; + dbgs() << "MS: NumIVMuls different, A.NumIVMuls = " << A.NumIVMuls << ", B.NumIVMuls == " << B.NumIVMuls << "\n"; return A.NumIVMuls < B.NumIVMuls; } - // 4) Only if per-iteration work ties, consider preheader-related costs. - if (A.AddRecCost != B.AddRecCost) + // 3) AddRecCost: per-iteration cost of IV updates (fewer IVs = lower cost). 
+ if (A.AddRecCost != B.AddRecCost) { + dbgs() << "MS: AddRecCost different, A.AddRecCost = " << A.AddRecCost << ", B.AddRecCost == " << B.AddRecCost << "\n"; return A.AddRecCost < B.AddRecCost; - if (A.SetupCost != B.SetupCost) + } + + // 4) Prefer fewer per-iteration base adds as a tie-breaker. + if (A.NumBaseAdds != B.NumBaseAdds) { + dbgs() << "MS: NumBaseAdds different, A.NumBaseAdds = " << A.NumBaseAdds << ", B.NumBaseAdds == " << B.NumBaseAdds << "\n"; + return A.NumBaseAdds < B.NumBaseAdds; + } + + // 5) Preheader-related costs. + if (A.SetupCost != B.SetupCost) { + dbgs() << "MS: SetupCost different, A.SetupCost = " << A.SetupCost << ", B.SetupCost == " << B.SetupCost << "\n"; return A.SetupCost < B.SetupCost; + } - // 5) Minor keys to stabilize ordering. - if (A.ScaleCost != B.ScaleCost) + // 6) Minor keys to stabilize ordering. + if (A.ScaleCost != B.ScaleCost) { + dbgs() << "MS: ScaleCost different, A.ScaleCost = " << A.ScaleCost << ", B.ScaleCost == " << B.ScaleCost << "\n"; return A.ScaleCost < B.ScaleCost; + } + return A.NumRegs < B.NumRegs; } From b52274dc284d5fde9bdc055cfd591ee60345c709 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Fri, 10 Oct 2025 11:04:31 -0500 Subject: [PATCH 6/6] [AMDGPU] TTI: Account for ScaleCost in LSR and improve cost model This patch refines the AMDGPU LSR cost model to better reflect the lack of rich addressing modes on the architecture. Key changes: 1. Implement getScalingFactorCost() to return 1 for base+scale*index addressing modes. While AMDGPU's isLegalAddressingMode() reports such modes as "legal", they require a separate ADD instruction unlike architectures with hardware-supported complex addressing. 2. Update isLSRCostLess() to use EffInsns = Insns + ScaleCost for the primary comparison. This ensures that LSR solutions using scaled addressing are properly penalized for the actual per-iteration work on AMDGPU. New priority order for GFX9+: 1. EffInsns (Insns + ScaleCost) - true per-iteration cost 2. 
NumIVMuls - IV multiplication chains 3. AddRecCost - IV update cost 4. NumBaseAdds - base address additions 5. SetupCost - preheader costs 6. ImmCost, NumRegs - tie-breakers Lit tests were carefully analyzed before regeneration to ensure: - No functional regressions (all test objectives preserved) - No incorrect machine sinking or divergence handling issues - Acceptable trade-offs (preheader overhead vs per-iteration efficiency) - No register spills or unnecessary barriers introduced --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 44 +- .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 5 + .../divergence-temporal-divergent-reg.ll | 8 +- llvm/test/CodeGen/AMDGPU/copy-to-reg.ll | 22 +- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 66 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 158 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 322 +- ...p-var-out-of-divergent-loop-swdev407790.ll | 22 +- ...ne-sink-temporal-divergence-swdev407790.ll | 186 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 143 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 13900 ++++++++-------- .../AMDGPU/lsr-invalid-ptr-extend.ll | 16 +- 12 files changed, 7463 insertions(+), 7429 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index b78e279ebcafe..a0c5b15765249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1576,46 +1576,66 @@ unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const { return BaseT::getNumberOfParts(Tp); } +InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + StackOffset BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const { + // AMDGPU has limited addressing modes. base+scale*index requires an extra + // ADD instruction, unlike architectures with rich addressing modes. 
+ if (HasBaseReg && Scale != 0) + return 1; + return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, + AddrSpace); +} + bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const { const GCNSubtarget &ST = *static_cast(getST()); // GFX9+: favor lower per-iteration work first; preheader/setup only as tie-breakers. if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS + 1) { - // 1) Total per-iteration instructions. This already includes base-adds, IV muls, etc. - if (A.Insns != B.Insns) { - dbgs() << "MS: Insns different, A.Insns = " << A.Insns << ", B.Insns == " << B.Insns << "\n"; - return A.Insns < B.Insns; + // AMDGPU lacks rich addressing modes; base+scale*index requires separate ADD. + // Include ScaleCost in effective per-iteration instruction count. + unsigned EffInsnsA = A.Insns + A.ScaleCost; + unsigned EffInsnsB = B.Insns + B.ScaleCost; + + // 1) Effective per-iteration instructions (includes addressing complexity). + if (EffInsnsA != EffInsnsB) { + // dbgs() << "MS: EffInsns different, A=" << EffInsnsA << " (Insns=" << A.Insns + // << "+ScaleCost=" << A.ScaleCost << "), B=" << EffInsnsB + // << " (Insns=" << B.Insns << "+ScaleCost=" << B.ScaleCost << ")\n"; + return EffInsnsA < EffInsnsB; } // 2) Strongly prefer fewer IV multiplications (mul/mul_hi/addc chains are costly on AMDGPU). if (A.NumIVMuls != B.NumIVMuls) { - dbgs() << "MS: NumIVMuls different, A.NumIVMuls = " << A.NumIVMuls << ", B.NumIVMuls == " << B.NumIVMuls << "\n"; + // dbgs() << "MS: NumIVMuls different, A.NumIVMuls = " << A.NumIVMuls << ", B.NumIVMuls == " << B.NumIVMuls << "\n"; return A.NumIVMuls < B.NumIVMuls; } // 3) AddRecCost: per-iteration cost of IV updates (fewer IVs = lower cost). 
if (A.AddRecCost != B.AddRecCost) { - dbgs() << "MS: AddRecCost different, A.AddRecCost = " << A.AddRecCost << ", B.AddRecCost == " << B.AddRecCost << "\n"; + // dbgs() << "MS: AddRecCost different, A.AddRecCost = " << A.AddRecCost << ", B.AddRecCost == " << B.AddRecCost << "\n"; return A.AddRecCost < B.AddRecCost; } // 4) Prefer fewer per-iteration base adds as a tie-breaker. if (A.NumBaseAdds != B.NumBaseAdds) { - dbgs() << "MS: NumBaseAdds different, A.NumBaseAdds = " << A.NumBaseAdds << ", B.NumBaseAdds == " << B.NumBaseAdds << "\n"; + // dbgs() << "MS: NumBaseAdds different, A.NumBaseAdds = " << A.NumBaseAdds << ", B.NumBaseAdds == " << B.NumBaseAdds << "\n"; return A.NumBaseAdds < B.NumBaseAdds; } // 5) Preheader-related costs. if (A.SetupCost != B.SetupCost) { - dbgs() << "MS: SetupCost different, A.SetupCost = " << A.SetupCost << ", B.SetupCost == " << B.SetupCost << "\n"; + // dbgs() << "MS: SetupCost different, A.SetupCost = " << A.SetupCost << ", B.SetupCost == " << B.SetupCost << "\n"; return A.SetupCost < B.SetupCost; } - // 6) Minor keys to stabilize ordering. - if (A.ScaleCost != B.ScaleCost) { - dbgs() << "MS: ScaleCost different, A.ScaleCost = " << A.ScaleCost << ", B.ScaleCost == " << B.ScaleCost << "\n"; - return A.ScaleCost < B.ScaleCost; + // 6) Minor keys to stabilize ordering (ImmCost, NumRegs). + // ScaleCost already accounted for in EffInsns, so not compared separately. + if (A.ImmCost != B.ImmCost) { + // dbgs() << "MS: ImmCost different, A.ImmCost = " << A.ImmCost << ", B.ImmCost == " << B.ImmCost << "\n"; + return A.ImmCost < B.ImmCost; } return A.NumRegs < B.NumRegs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index d0929011c50c7..6bb19e2da5183 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -303,6 +303,11 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// implementation. 
unsigned getNumberOfParts(Type *Tp) const override; + InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, + StackOffset BaseOffset, + bool HasBaseReg, int64_t Scale, + unsigned AddrSpace) const; + bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const; bool isNumRegsMajorCostOfLSR(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index 9a3525edc4b34..d4e5487828c48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll @@ -5,14 +5,14 @@ define void @temporal_divergent_i32(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 @@ -40,14 +40,14 @@ define void @temporal_divergent_i32_multiple_use(float %val, ptr %addr, ptr %add ; GFX10-LABEL: temporal_divergent_i32_multiple_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s5, -1 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, s5 -; GFX10-NEXT: s_add_i32 s5, s5, 1 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, 
s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll index 931a14473c340..f5223d5553c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX7-NEXT: s_add_u32 s12, s12, s11 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s1, 0 ; GFX7-NEXT: .LBB0_1: ; %loop ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_add_i32 s1, s1, 1 -; GFX7-NEXT: s_add_i32 s0, s0, 4 -; GFX7-NEXT: s_cmp_lt_u32 s1, 16 +; GFX7-NEXT: s_lshl_b32 s1, s0, 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_i32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_cmp_lt_u32 s0, 16 ; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GFX7-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX7-NEXT: ; %bb.2: ; %done @@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a, ; GFX8-NEXT: s_add_u32 s88, s88, s11 ; GFX8-NEXT: s_addc_u32 s89, s89, 0 ; GFX8-NEXT: s_mov_b32 s0, 0 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB0_1: ; %loop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_add_i32 s1, s1, 1 -; GFX8-NEXT: s_add_i32 s0, s0, 4 -; GFX8-NEXT: s_cmp_lt_u32 s1, 16 +; GFX8-NEXT: s_lshl_b32 s1, s0, 2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_i32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_cmp_lt_u32 s0, 16 ; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GFX8-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX8-NEXT: ; %bb.2: ; %done diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 
31344c78990b8..c27a12f4588ee 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2104,36 +2104,36 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: ; GFX1250-GISEL: ; %bb.0: ; %bb -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] -; 
GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1 ; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX1250-GISEL-NEXT: s_endpgm @@ -2159,42 +2159,40 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-SDAG: ; %bb.0: ; %bb ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 -; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm ; ; 
GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: ; GFX1250-GISEL: ; %bb.0: ; %bb -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] -; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4 +; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 ; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 9ebf6ae88a517..a405f7888423f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4722,17 +4722,16 @@ define amdgpu_ps float 
@global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX9-LABEL: global_addr_64bit_lsr_iv: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_movk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB132_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_add_u32 s4, s2, s0 -; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -4740,17 +4739,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX10-LABEL: global_addr_64bit_lsr_iv: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_movk_i32 s0, 0x100 ; GFX10-NEXT: .LBB132_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s4, s2, s0 -; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX10-NEXT: s_add_i32 s0, s0, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -4758,17 +4756,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr 
addrspace(1) inreg %arg) { ; GFX11-LABEL: global_addr_64bit_lsr_iv: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_movk_i32 s0, 0x100 ; GFX11-NEXT: .LBB132_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s4, s2, s0 -; GFX11-NEXT: s_addc_u32 s5, s3, s1 -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, -1 +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -4776,38 +4772,34 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1 ; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv: ; GFX12-GISEL: ; %bb.0: 
; %bb -; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc -; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-NEXT: s_endpgm @@ -4832,20 +4824,18 @@ bb3: ; preds = %bb3, %bb define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) { ; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_movk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: .LBB133_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_add_u32 s4, s2, s0 -; GFX9-NEXT: s_addc_u32 s5, s3, s1 -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_add_u32 s2, s2, 4 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -4853,20 +4843,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: s_movk_i32 s0, 0x100 ; GFX10-NEXT: .LBB133_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_add_u32 s4, s2, s0 -; GFX10-NEXT: s_addc_u32 s5, s3, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 4 -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400 -; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5 +; GFX10-NEXT: s_add_i32 s0, s0, -1 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_add_u32 s2, s2, 4 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -4874,19 +4862,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mov_b64 s[0:1], 0 
+; GFX11-NEXT: s_movk_i32 s0, 0x100 ; GFX11-NEXT: .LBB133_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s4, s2, s0 -; GFX11-NEXT: s_addc_u32 s5, s3, s1 -; GFX11-NEXT: s_add_u32 s0, s0, 4 -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX11-NEXT: s_add_i32 s0, s0, -1 +; GFX11-NEXT: s_add_u32 s2, s2, 4 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB133_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -4894,42 +4880,38 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX12-SDAG: ; %bb.0: ; %bb ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100 ; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3 ; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4 +; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0 ; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1 ; 
GFX12-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload: ; GFX12-GISEL: ; %bb.0: ; %bb -; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc -; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc -; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS +; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2 +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4 +; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1 ; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2 ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 835818fb2fd15..4b524481b38c1 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -8,44 +8,39 @@ define amdgpu_kernel void 
@udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s8, s5, s4 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s2, s2, s4 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, s3 ; GFX9-NEXT: .LBB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 -; GFX9-NEXT: s_mul_i32 s10, s6, s10 -; GFX9-NEXT: s_add_i32 s11, s5, 1 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_add_i32 s10, s7, s10 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s11, s11, s5 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_add_i32 s10, s11, 1 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s11 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 -; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX9-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s7, s2, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s9, s7, s6 +; GFX9-NEXT: 
s_cmp_ge_u32 s7, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -55,45 +50,40 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s2, 0, s6 +; GFX10-NEXT: s_sub_i32 s3, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 -; GFX10-NEXT: s_add_i32 s8, s4, s5 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s2 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s2, s4 +; GFX10-NEXT: s_mov_b32 s2, s3 ; GFX10-NEXT: .LBB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX10-NEXT: s_mul_i32 s7, s5, s6 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b32 s10, s5 -; GFX10-NEXT: s_mul_i32 s9, s6, s5 -; GFX10-NEXT: s_mul_i32 s10, s6, s10 -; GFX10-NEXT: s_sub_i32 s9, s7, s9 -; GFX10-NEXT: s_add_i32 s11, s5, 1 -; GFX10-NEXT: s_add_i32 s10, s7, s10 -; GFX10-NEXT: s_cmp_ge_u32 s9, 
s6 -; GFX10-NEXT: s_cselect_b32 s11, s11, s5 -; GFX10-NEXT: s_cselect_b32 s9, s10, s9 -; GFX10-NEXT: s_add_i32 s10, s11, 1 -; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s9, s10, s11 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 -; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX10-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10-NEXT: s_add_i32 s8, s5, 1 +; GFX10-NEXT: s_sub_i32 s7, s2, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s6 +; GFX10-NEXT: s_cmp_ge_u32 s7, s6 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 +; GFX10-NEXT: s_cselect_b32 s7, s9, s7 +; GFX10-NEXT: s_add_i32 s8, s5, 1 +; GFX10-NEXT: s_cmp_ge_u32 s7, s6 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s8, s0, s8 +; GFX10-NEXT: s_addc_u32 s9, s1, s9 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -103,49 +93,46 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX11-NEXT: s_sub_i32 s2, 0, s6 +; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: s_add_i32 s8, s4, s5 -; GFX11-NEXT: s_mov_b64 s[4:5], 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s2 +; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s2, s4 +; GFX11-NEXT: s_mov_b32 s2, s3 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB0_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX11-NEXT: s_mul_i32 s7, s5, s6 +; GFX11-NEXT: s_add_i32 s8, s5, 1 +; GFX11-NEXT: s_sub_i32 s7, s2, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s9, s7, s6 +; GFX11-NEXT: s_cmp_ge_u32 s7, s6 +; GFX11-NEXT: s_cselect_b32 s5, s8, s5 +; GFX11-NEXT: s_cselect_b32 s7, s9, s7 +; GFX11-NEXT: s_add_i32 s8, s5, 1 +; GFX11-NEXT: s_cmp_ge_u32 s7, s6 +; GFX11-NEXT: s_cselect_b32 s5, s8, s5 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_add_u32 s8, s0, s8 +; GFX11-NEXT: s_addc_u32 s9, s1, s9 +; GFX11-NEXT: s_add_i32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s10, s5 -; GFX11-NEXT: s_mul_i32 s9, s6, s5 -; GFX11-NEXT: s_mul_i32 s10, s6, s10 -; GFX11-NEXT: s_sub_i32 s9, s7, s9 -; GFX11-NEXT: s_add_i32 s11, s5, 1 -; GFX11-NEXT: s_add_i32 s10, s7, s10 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s11, s11, s5 -; GFX11-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-NEXT: s_add_i32 s10, s11, 1 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s9, s10, s11 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 -; GFX11-NEXT: s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 -; GFX11-NEXT: 
v_mov_b32_e32 v1, s9 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: global_store_b32 v0, v1, s[8:9] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm @@ -171,42 +158,37 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_sub_i32 s4, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s8, s5, s4 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s2, s2, s4 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, s3 ; GFX9-NEXT: .LBB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_not_b32 s10, s5 -; GFX9-NEXT: s_mul_i32 s9, s6, s5 -; GFX9-NEXT: s_mul_i32 s10, s6, s10 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_add_i32 s10, s7, s10 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_sub_i32 s10, s9, s6 -; GFX9-NEXT: s_cmp_ge_u32 s9, s6 -; GFX9-NEXT: s_cselect_b32 s9, s10, s9 -; GFX9-NEXT: s_add_u32 s10, s0, s2 -; GFX9-NEXT: s_addc_u32 s11, s1, s3 -; GFX9-NEXT: s_add_i32 s7, s7, 1 -; GFX9-NEXT: s_add_u32 s4, s4, s8 -; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: 
s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX9-NEXT: global_store_dword v0, v1, s[10:11] +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s5, s2, s5 +; GFX9-NEXT: s_sub_i32 s7, s5, s6 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_sub_i32 s7, s5, s6 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX9-NEXT: s_add_u32 s8, s0, s8 +; GFX9-NEXT: s_addc_u32 s9, s1, s9 +; GFX9-NEXT: s_add_i32 s2, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -216,43 +198,38 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_sub_i32 s2, 0, s6 +; GFX10-NEXT: s_sub_i32 s3, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX10-NEXT: s_mov_b64 s[2:3], 0 -; GFX10-NEXT: s_add_i32 s8, s4, s5 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_mul_i32 s3, s3, s2 +; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_add_i32 s4, s2, s4 +; GFX10-NEXT: s_mov_b32 s2, s3 ; GFX10-NEXT: .LBB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_not_b32 s9, s5 +; GFX10-NEXT: s_mul_hi_u32 
s5, s2, s4 +; GFX10-NEXT: s_mul_i32 s5, s5, s6 +; GFX10-NEXT: s_sub_i32 s5, s2, s5 +; GFX10-NEXT: s_sub_i32 s7, s5, s6 +; GFX10-NEXT: s_cmp_ge_u32 s5, s6 +; GFX10-NEXT: s_cselect_b32 s5, s7, s5 +; GFX10-NEXT: s_sub_i32 s7, s5, s6 +; GFX10-NEXT: s_cmp_ge_u32 s5, s6 +; GFX10-NEXT: s_cselect_b32 s5, s7, s5 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mul_i32 s10, s6, s5 -; GFX10-NEXT: s_mul_i32 s9, s6, s9 -; GFX10-NEXT: s_sub_i32 s10, s7, s10 -; GFX10-NEXT: s_add_i32 s9, s7, s9 -; GFX10-NEXT: s_cmp_ge_u32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s9, s9, s10 -; GFX10-NEXT: s_sub_i32 s10, s9, s6 -; GFX10-NEXT: s_cmp_ge_u32 s9, s6 -; GFX10-NEXT: s_cselect_b32 s9, s10, s9 -; GFX10-NEXT: s_add_u32 s10, s0, s2 -; GFX10-NEXT: s_addc_u32 s11, s1, s3 -; GFX10-NEXT: s_add_i32 s7, s7, 1 -; GFX10-NEXT: s_add_u32 s4, s4, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s2, 4 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX10-NEXT: global_store_dword v0, v1, s[10:11] +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s8, s0, s8 +; GFX10-NEXT: s_addc_u32 s9, s1, s9 +; GFX10-NEXT: s_add_i32 s2, s2, 1 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -262,48 +239,45 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX11-NEXT: s_sub_i32 s2, 0, s6 +; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s2, s2, s4 -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2 -; GFX11-NEXT: s_mov_b64 s[2:3], 0 -; GFX11-NEXT: s_add_i32 s8, s4, s5 -; GFX11-NEXT: s_mov_b64 s[4:5], 0 +; GFX11-NEXT: s_mul_i32 s3, s3, s2 +; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_add_i32 s4, s2, s4 +; GFX11-NEXT: s_mov_b32 s2, s3 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB1_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4 +; GFX11-NEXT: s_mul_i32 s5, s5, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s5, s2, s5 +; GFX11-NEXT: s_sub_i32 s7, s5, s6 +; GFX11-NEXT: s_cmp_ge_u32 s5, s6 +; GFX11-NEXT: s_cselect_b32 s5, s7, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_not_b32 s9, s5 -; GFX11-NEXT: s_mul_i32 s10, s6, s5 -; GFX11-NEXT: s_mul_i32 s9, s6, s9 -; GFX11-NEXT: s_sub_i32 s10, s7, s10 -; GFX11-NEXT: s_add_i32 s9, s7, s9 -; GFX11-NEXT: s_cmp_ge_u32 s10, s6 -; GFX11-NEXT: s_cselect_b32 s9, s9, s10 +; GFX11-NEXT: s_sub_i32 s7, s5, s6 +; GFX11-NEXT: s_cmp_ge_u32 s5, s6 +; GFX11-NEXT: s_cselect_b32 s5, s7, s5 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s5 +; GFX11-NEXT: s_add_u32 s8, s0, s8 +; GFX11-NEXT: s_addc_u32 s9, s1, s9 +; GFX11-NEXT: s_add_i32 s2, s2, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s10, s9, s6 -; GFX11-NEXT: s_cmp_ge_u32 s9, s6 -; GFX11-NEXT: s_cselect_b32 s9, s10, s9 -; GFX11-NEXT: s_add_u32 s10, s0, s2 -; GFX11-NEXT: s_addc_u32 s11, s1, s3 -; GFX11-NEXT: 
s_add_i32 s7, s7, 1 -; GFX11-NEXT: s_add_u32 s4, s4, s8 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: s_addc_u32 s5, s5, 0 -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000 -; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: global_store_b32 v0, v1, s[8:9] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index 34a9624cb19eb..e8a7a11afda0d 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -19,11 +19,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_1: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; CHECK-NEXT: v_add_nc_u32_e32 v3, -4, v3 ; CHECK-NEXT: .LBB0_2: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; j lastloop entry ; CHECK-NEXT: ;;#ASMEND @@ -33,40 +32,41 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-NEXT: .LBB0_3: ; %for.body33 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: s_mov_b32 s9, 4 ; 
CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s9, 0 ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_5: ; %if.end118 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: v_add_nc_u32_e32 v4, 4, v1 ; CHECK-NEXT: s_add_i32 s9, s9, 4 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; backedge ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v4, v0 ; CHECK-NEXT: s_or_b32 s8, s5, s8 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 ; CHECK-NEXT: s_cbranch_execz .LBB0_1 ; CHECK-NEXT: .LBB0_6: ; %for.body51 ; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, 1 ; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %if.then112 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: s_add_i32 s10, s9, 4 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, s10 -; CHECK-NEXT: ds_write_b32 v1, v3 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, s9 +; CHECK-NEXT: ds_write_b32 v3, v4 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader ; CHECK-NEXT: s_inst_prefetch 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 680942fcb4d4b..15c4e746b1e07 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -62,7 +62,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] -; 
CHECK-NEXT: v_mov_b32_e32 v45, 0 +; CHECK-NEXT: v_mov_b32_e32 v46, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v43, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -91,7 +91,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 +; CHECK-NEXT: ds_write_b32 v46, v46 offset:15360 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 @@ -118,69 +118,66 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.1: ; %.preheader5 -; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14 +; CHECK-NEXT: v_mul_lo_u32 v44, v41, 14 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44 ; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s5, v45 ; CHECK-NEXT: s_add_i32 s5, s5, 1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42 -; CHECK-NEXT: ds_write_b8 v1, v45 +; CHECK-NEXT: ds_write_b8 v0, v46 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42 +; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v42 ; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v46 ; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo ; CHECK-NEXT: s_cbranch_execz .LBB0_25 ; CHECK-NEXT: ; %bb.4: ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: 
v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s55, 0 +; CHECK-NEXT: s_mov_b32 s54, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44 -; CHECK-NEXT: s_lshl_b32 s4, s55, 5 -; CHECK-NEXT: s_add_i32 s54, s55, 1 -; CHECK-NEXT: s_add_i32 s5, s55, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54 +; CHECK-NEXT: s_mov_b32 s4, s54 +; CHECK-NEXT: s_lshl_b32 s5, s54, 5 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45 +; CHECK-NEXT: s_add_i32 s54, s54, 1 +; CHECK-NEXT: s_add_i32 s4, s4, 5 +; CHECK-NEXT: v_or3_b32 v57, s5, v43, s54 +; CHECK-NEXT: v_mov_b32_e32 v58, s54 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: ds_read_u8 v56, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s54 -; CHECK-NEXT: s_mov_b32 s68, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 +; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 s4, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s69, 0 -; CHECK-NEXT: s_mov_b32 s80, 0 +; CHECK-NEXT: s_mov_b32 s68, 0 +; CHECK-NEXT: s_mov_b32 s69, s54 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: s_add_i32 s80, s80, 4 -; CHECK-NEXT: s_add_i32 s4, s55, s80 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57 -; CHECK-NEXT: s_add_i32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s4, s4, 1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 -; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: s_add_i32 s4, s69, 4 +; CHECK-NEXT: v_add_nc_u32_e32 v57, 4, v57 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v42 +; CHECK-NEXT: v_mov_b32_e32 v58, s69 +; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, 
s68 ; CHECK-NEXT: s_cbranch_execz .LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57 -; CHECK-NEXT: ds_read_u8 v0, v59 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s69, v45 +; CHECK-NEXT: s_mov_b32 s69, s4 +; CHECK-NEXT: ds_read_u8 v0, v58 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s5, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s80, s5 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -199,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v58 +; CHECK-NEXT: ds_write_b32 v0, v57 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -221,17 +218,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: 
s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v60 +; CHECK-NEXT: ds_write_b32 v0, v59 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -247,17 +244,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 +; CHECK-NEXT: v_add_nc_u32_e32 v59, 2, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v60 +; CHECK-NEXT: ds_write_b32 v0, v59 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81 -; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80 +; CHECK-NEXT: ds_read_u8 v0, v58 offset:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD -; CHECK-NEXT: s_and_saveexec_b32 s81, s4 +; CHECK-NEXT: s_and_saveexec_b32 s80, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -273,19 +270,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s51 ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, 
v58 +; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_16: ; %Flow43 -; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69 -; CHECK-NEXT: v_mov_b32_e32 v57, v0 -; CHECK-NEXT: .LBB0_17: ; %Flow44 +; CHECK-NEXT: .LBB0_16: ; %Flow32 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 +; CHECK-NEXT: .LBB0_17: ; %Flow33 +; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 @@ -306,7 +302,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD ; CHECK-NEXT: s_and_saveexec_b32 s69, s4 @@ -330,24 +326,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 ; CHECK-NEXT: s_branch .LBB0_19 -; CHECK-NEXT: .LBB0_22: ; %Flow41 +; CHECK-NEXT: .LBB0_22: ; %Flow30 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68 -; CHECK-NEXT: .LBB0_23: ; %Flow42 +; CHECK-NEXT: .LBB0_23: ; %Flow31 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 
vcc_lo, s54, v46 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 -; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s55, s54 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 ; CHECK-NEXT: s_or_b32 s53, s4, s53 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 -; CHECK-NEXT: .LBB0_25: ; %Flow49 +; CHECK-NEXT: .LBB0_25: ; %Flow38 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 @@ -828,7 +822,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 +; CHECK-NEXT: v_mul_lo_u32 v44, v0, 14 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -842,7 +836,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s13, s50 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 -; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42 @@ -867,51 +861,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42 ; CHECK-NEXT: s_mov_b32 s52, 0 -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364 -; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41 +; CHECK-NEXT: s_mov_b32 s53, 0 +; CHECK-NEXT: ds_write_b8 v44, v43 offset:15364 +; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v41 ; CHECK-NEXT: .LBB1_1: ; %.37 ; CHECK-NEXT: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB1_3 Depth 2 ; CHECK-NEXT: ; Child Loop BB1_8 Depth 2 -; 
CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44 -; CHECK-NEXT: s_lshl_b32 s5, s4, 5 -; CHECK-NEXT: s_add_i32 s53, s4, 1 -; CHECK-NEXT: s_add_i32 s6, s4, 5 -; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53 +; CHECK-NEXT: s_mov_b32 s4, s53 +; CHECK-NEXT: s_lshl_b32 s6, s53, 5 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45 +; CHECK-NEXT: s_add_i32 s53, s53, 1 +; CHECK-NEXT: s_add_i32 s5, s4, 5 +; CHECK-NEXT: v_or3_b32 v56, s6, v42, s53 +; CHECK-NEXT: v_mov_b32_e32 v57, s53 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v46, v0 -; CHECK-NEXT: v_mov_b32_e32 v56, s53 -; CHECK-NEXT: s_mov_b32 s5, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41 +; CHECK-NEXT: ds_read_u8 v47, v0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_5 ; CHECK-NEXT: ; %bb.2: ; %.53.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: s_mov_b32 s7, 0 ; CHECK-NEXT: .LBB1_3: ; %.53 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: s_add_i32 s7, s7, 4 +; CHECK-NEXT: s_add_i32 s7, s5, 4 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 -; CHECK-NEXT: s_add_i32 s8, s4, s7 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47 -; CHECK-NEXT: s_add_i32 s9, s8, 5 -; CHECK-NEXT: s_add_i32 s8, s8, 1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41 -; CHECK-NEXT: v_mov_b32_e32 v56, s8 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s7, v41 +; CHECK-NEXT: v_add_nc_u32_e32 v56, 4, v56 +; CHECK-NEXT: v_mov_b32_e32 v57, s5 +; CHECK-NEXT: s_mov_b32 s5, s7 ; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_execnz .LBB1_3 -; CHECK-NEXT: ; %bb.4: ; %Flow3 +; CHECK-NEXT: ; %bb.4: ; %Flow4 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; CHECK-NEXT: v_mov_b32_e32 v47, v0 -; CHECK-NEXT: .LBB1_5: ; %Flow4 +; CHECK-NEXT: .LBB1_5: ; %Flow5 ; CHECK-NEXT: ; in Loop: Header=BB1_1 
Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_mov_b32 s54, exec_lo -; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41 +; CHECK-NEXT: v_cmpx_lt_u32_e64 v57, v41 ; CHECK-NEXT: s_cbranch_execz .LBB1_11 ; CHECK-NEXT: ; %bb.6: ; %.103.preheader ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -922,19 +913,19 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: .LBB1_7: ; %.114 ; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64 +; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56 -; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v57, v41 ; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: s_cbranch_execz .LBB1_10 ; CHECK-NEXT: .LBB1_8: ; %.103 ; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v57 +; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v47, v0 src0_sel:BYTE_0 src1_sel:DWORD ; CHECK-NEXT: s_and_saveexec_b32 s64, s4 ; CHECK-NEXT: s_cbranch_execz .LBB1_7 ; CHECK-NEXT: ; %bb.9: ; %.110 @@ -955,23 +946,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: ds_write_b32 v0, v47 +; CHECK-NEXT: ds_write_b32 v0, v56 ; CHECK-NEXT: s_branch .LBB1_7 ; CHECK-NEXT: .LBB1_10: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 ; CHECK-NEXT: s_or_b32 exec_lo, 
exec_lo, s55 -; CHECK-NEXT: .LBB1_11: ; %Flow2 +; CHECK-NEXT: .LBB1_11: ; %Flow3 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 ; CHECK-NEXT: s_or_b32 s52, s4, s52 -; CHECK-NEXT: s_mov_b32 s4, s53 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; %.119 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 647e730c8f51f..ca4f5d22ca9a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -7,119 +7,138 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-LABEL: issue63986: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshlrev_b64 v[2:3], 6, v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v5, s17 -; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, s16, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc +; CHECK-NEXT: v_lshlrev_b64 v[4:5], 6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v6, s17 +; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s16, v4 +; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v6, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB0_1: ; %loop-memcpy-expansion ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: v_mov_b32_e32 v3, s5 -; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; CHECK-NEXT: v_mov_b32_e32 v7, s5 +; CHECK-NEXT: v_mov_b32_e32 v6, s4 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v8 ; CHECK-NEXT: s_add_u32 s4, 
s4, 16 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[6:7], s[4:5], 32 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[2:3], v[6:9] +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] ; CHECK-NEXT: s_cbranch_vccz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %loop-memcpy-residual-header -; CHECK-NEXT: s_cbranch_execnz .LBB0_5 -; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_branch .LBB0_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: s_branch .LBB0_5 +; CHECK-NEXT: .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge +; CHECK-NEXT: v_lshlrev_b64 v[6:7], 6, v[2:3] +; CHECK-NEXT: s_cbranch_execnz .LBB0_8 +; CHECK-NEXT: .LBB0_5: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_add_u32 s4, s16, 32 +; CHECK-NEXT: s_addc_u32 s5, s17, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual +; CHECK-NEXT: ; %bb.6: ; %loop-memcpy-residual ; CHECK-NEXT: s_add_u32 s6, 32, s4 ; CHECK-NEXT: s_addc_u32 s7, 0, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s6, v4 +; CHECK-NEXT: v_mov_b32_e32 v6, s6 +; CHECK-NEXT: v_mov_b32_e32 v7, s7 +; CHECK-NEXT: flat_load_ubyte v10, v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v7, s5 +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc ; CHECK-NEXT: s_add_u32 s4, s4, 1 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; CHECK-NEXT: s_addc_u32 s5, 0, s5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v6 -; CHECK-NEXT: 
.LBB0_5: ; %post-loop-memcpy-expansion +; CHECK-NEXT: flat_store_byte v[6:7], v10 +; CHECK-NEXT: ; %bb.7: +; CHECK-NEXT: v_mov_b32_e32 v7, v5 +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: .LBB0_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_and_b32_e32 v2, 15, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, v6, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1] ; CHECK-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3] -; CHECK-NEXT: s_branch .LBB0_8 -; CHECK-NEXT: .LBB0_6: ; %Flow7 -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v6, s17 +; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, s16, v4 +; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc +; CHECK-NEXT: s_branch .LBB0_11 +; CHECK-NEXT: .LBB0_9: ; %Flow14 +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: .LBB0_7: ; %Flow9 -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: .LBB0_10: ; %Flow16 +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; CHECK-NEXT: s_cbranch_vccz .LBB0_16 -; CHECK-NEXT: .LBB0_8: ; %while.cond +; CHECK-NEXT: s_cbranch_vccz .LBB0_19 +; CHECK-NEXT: .LBB0_11: ; %while.cond ; CHECK-NEXT: ; =>This Loop Header: Depth=1 -; CHECK-NEXT: ; Child Loop BB0_10 Depth 2 -; CHECK-NEXT: ; Child Loop BB0_14 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_13 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_17 Depth 2 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_11 -; CHECK-NEXT: ; %bb.9: ; %loop-memcpy-expansion2.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_14 +; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: 
s_mov_b64 s[12:13], 0 -; CHECK-NEXT: .LBB0_10: ; %loop-memcpy-expansion2 -; CHECK-NEXT: ; Parent Loop BB0_8 Depth=1 +; CHECK-NEXT: .LBB0_13: ; %loop-memcpy-expansion2 +; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v6, s12 ; CHECK-NEXT: v_mov_b32_e32 v7, s13 -; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[6:7] -; CHECK-NEXT: v_mov_b32_e32 v11, s13 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s12, v4 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[6:7] +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s12, v8 ; CHECK-NEXT: s_add_u32 s12, s12, 16 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v5, v11, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1] ; CHECK-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[10:11], v[6:9] +; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] -; CHECK-NEXT: s_cbranch_execnz .LBB0_10 -; CHECK-NEXT: .LBB0_11: ; %Flow8 -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: s_cbranch_execnz .LBB0_13 +; CHECK-NEXT: .LBB0_14: ; %Flow15 +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_cbranch_execz .LBB0_7 -; CHECK-NEXT: ; %bb.12: ; %loop-memcpy-residual-header5 -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_10 +; CHECK-NEXT: ; %bb.15: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] ; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] -; CHECK-NEXT: s_cbranch_execz .LBB0_6 -; CHECK-NEXT: ; %bb.13: ; %loop-memcpy-residual4.preheader -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: s_cbranch_execz .LBB0_9 +; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual4.preheader +; 
CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_mov_b64 s[12:13], 0 ; CHECK-NEXT: s_mov_b64 s[14:15], 0 -; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-residual4 -; CHECK-NEXT: ; Parent Loop BB0_8 Depth=1 +; CHECK-NEXT: .LBB0_17: ; %loop-memcpy-residual4 +; CHECK-NEXT: ; Parent Loop BB0_11 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_mov_b32_e32 v7, s15 +; CHECK-NEXT: v_mov_b32_e32 v10, s15 ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v0 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc -; CHECK-NEXT: flat_load_ubyte v8, v[6:7] +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v10, vcc +; CHECK-NEXT: flat_load_ubyte v11, v[6:7] +; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s14, v4 ; CHECK-NEXT: s_add_u32 s14, s14, 1 ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[2:3] -; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 -; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v10, vcc ; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[6:7], v8 +; CHECK-NEXT: flat_store_byte v[6:7], v11 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_cbranch_execnz .LBB0_14 -; CHECK-NEXT: ; %bb.15: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_8 Depth=1 +; CHECK-NEXT: s_cbranch_execnz .LBB0_17 +; CHECK-NEXT: ; %bb.18: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_11 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_16: ; %DummyReturnBlock +; CHECK-NEXT: s_branch .LBB0_9 +; CHECK-NEXT: .LBB0_19: ; %DummyReturnBlock ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 06213ef3e06ea..9150bd0dfcd30 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -5396,759 +5396,762 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB5_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:224 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3] 
offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[2:3] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) 
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 -; CHECK-NEXT: .LBB5_3: ; %Flow5 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB5_3: ; %Flow15 +; 
CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB5_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:224 +; 
CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:240 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:192 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:208 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:160 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:176 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[2:3] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: 
s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 -; CHECK-NEXT: .LBB5_6: ; %Flow6 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB5_6: ; %Flow16 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte 
Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB5_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[2:3] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[2:3] offset:48 +; ALIGNED-NEXT: flat_load_dwordx4 v[84:87], v[2:3] 
offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[2:3] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[2:3] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[2:3] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[2:3] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[36:39], v[2:3] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:208 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; 
ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v16 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, 
off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v21 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v22 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v20 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: 
buffer_store_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; 
ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:208 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: 
buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: 
buffer_load_dword v28, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:202 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:192 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 
offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v38, off, 
s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v38 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:166 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v37 offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:160 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 
+; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:138 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:128 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v66 offset:122 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:114 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:112 +; ALIGNED-NEXT: buffer_store_dword v68, off, 
s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:106 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:96 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v83 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:90 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v82 offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:80 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:74 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v84 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:64 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: 
buffer_load_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; 
ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:253 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:235 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v32 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:227 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 
offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 
offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:205 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, 
v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:189 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:183 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:173 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:163 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 
offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:151 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:149 
+; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:145 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:139 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:129 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:119 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:113 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:111 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:109 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:107 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:105 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:103 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:101 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:89 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:81 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 
offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:73 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:57 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:43 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:47 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:25 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:23 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) 
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6157,378 +6160,364 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:3 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 -; ALIGNED-NEXT: .LBB5_3: ; %Flow5 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB5_3: ; %Flow15 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB5_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; 
ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x700, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[12:13] offset:240 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[12:13] offset:224 +; ALIGNED-NEXT: flat_load_dwordx4 v[0:3], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[12:13] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[12:13] offset:48 +; ALIGNED-NEXT: 
flat_load_dwordx4 v[112:115], v[12:13] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[12:13] offset:80 +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[12:13] offset:96 +; ALIGNED-NEXT: flat_load_dwordx4 v[27:30], v[12:13] offset:112 +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[12:13] offset:128 +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[12:13] offset:144 +; ALIGNED-NEXT: flat_load_dwordx4 v[54:57], v[12:13] offset:160 +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[12:13] offset:176 +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[12:13] offset:192 +; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[12:13] offset:208 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffff00, v12 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:348 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v33 offset:254 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v32 offset:250 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v31 offset:246 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v26 offset:242 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:240 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 
-; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v53 offset:238 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v52 offset:234 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:232 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v51 offset:230 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:228 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v50 offset:226 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:224 ; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword 
v103, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, 
s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v81 offset:222 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v80 offset:218 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v67 offset:214 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v66 offset:210 +; ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:208 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: 
buffer_store_dword v84, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v98 offset:206 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v86 offset:198 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[14:15], v82 offset:194 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:192 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v99 offset:190 +; ALIGNED-NEXT: 
flat_store_byte v[14:15], v99 offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v96 offset:186 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v83 offset:182 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v68 offset:178 +; ALIGNED-NEXT: flat_store_byte v[14:15], v68 offset:176 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v97 offset:174 +; ALIGNED-NEXT: flat_store_byte v[14:15], v97 offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v84 offset:170 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v69 offset:166 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v54 offset:162 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:160 ; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; 
ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v85 offset:158 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v70 offset:154 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v55 offset:150 +; ALIGNED-NEXT: flat_store_byte v[14:15], v55 offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v38 offset:146 +; ALIGNED-NEXT: flat_store_byte v[14:15], v38 offset:144 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v71 offset:142 +; ALIGNED-NEXT: flat_store_byte v[14:15], v71 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v64 offset:138 +; ALIGNED-NEXT: flat_store_byte v[14:15], v64 offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v39 offset:134 +; ALIGNED-NEXT: flat_store_byte v[14:15], v39 offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v34 offset:130 +; ALIGNED-NEXT: flat_store_byte v[14:15], v34 offset:128 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v65 offset:126 +; ALIGNED-NEXT: flat_store_byte v[14:15], v65 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v48 offset:122 +; ALIGNED-NEXT: flat_store_byte v[14:15], v48 offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v35 offset:118 +; ALIGNED-NEXT: flat_store_byte v[14:15], v35 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v27 offset:114 +; ALIGNED-NEXT: flat_store_byte v[14:15], v27 offset:112 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: 
s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v49 offset:110 +; ALIGNED-NEXT: flat_store_byte v[14:15], v49 offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v36 offset:106 +; ALIGNED-NEXT: flat_store_byte v[14:15], v36 offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v28 offset:102 +; ALIGNED-NEXT: flat_store_byte v[14:15], v28 offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v22 offset:98 +; ALIGNED-NEXT: flat_store_byte v[14:15], v22 offset:96 ; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 ; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v37 offset:94 +; ALIGNED-NEXT: flat_store_byte v[14:15], v37 offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v29 offset:90 +; ALIGNED-NEXT: flat_store_byte v[14:15], v29 offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v23 offset:86 +; ALIGNED-NEXT: flat_store_byte v[14:15], v23 offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v19 offset:82 +; ALIGNED-NEXT: flat_store_byte v[14:15], v19 offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:452 +; 
ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v30 offset:78 +; ALIGNED-NEXT: flat_store_byte v[14:15], v30 offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v24 offset:74 +; ALIGNED-NEXT: flat_store_byte v[14:15], v24 offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v20 offset:70 +; ALIGNED-NEXT: flat_store_byte v[14:15], v20 offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v17 offset:66 +; ALIGNED-NEXT: flat_store_byte v[14:15], v17 offset:64 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 
offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v25 offset:62 +; ALIGNED-NEXT: flat_store_byte v[14:15], v25 offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v21 offset:58 +; ALIGNED-NEXT: flat_store_byte v[14:15], v21 offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v18 offset:54 +; ALIGNED-NEXT: flat_store_byte v[14:15], v18 offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[14:15], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:556 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:548 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v11 offset:42 +; ALIGNED-NEXT: flat_store_byte v[14:15], v11 offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v10 offset:46 +; ALIGNED-NEXT: flat_store_byte v[14:15], v10 offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v9 offset:34 +; ALIGNED-NEXT: flat_store_byte v[14:15], v9 offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v8 offset:38 +; ALIGNED-NEXT: flat_store_byte v[14:15], v8 offset:36 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 @@ -6538,274 +6527,295 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v7 offset:30 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v6 offset:26 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v5 offset:22 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v4 offset:18 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:16 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 
offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[14:15], v102 offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: flat_store_byte 
v[14:15], v26 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; 
ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v34 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, 
v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[14:15], v102 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v22 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 
24, v29 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[14:15], v99 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[14:15], v68 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: flat_store_byte v[14:15], v97 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v18 ; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 
-; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[14:15], v33 offset:151 +; ALIGNED-NEXT: flat_store_byte v[14:15], v55 offset:149 +; ALIGNED-NEXT: flat_store_byte v[14:15], v80 offset:147 +; ALIGNED-NEXT: flat_store_byte v[14:15], v38 offset:145 +; ALIGNED-NEXT: flat_store_byte v[14:15], v101 offset:143 +; ALIGNED-NEXT: flat_store_byte v[14:15], v71 offset:141 +; ALIGNED-NEXT: flat_store_byte v[14:15], v67 offset:139 +; ALIGNED-NEXT: flat_store_byte v[14:15], v64 offset:137 +; ALIGNED-NEXT: flat_store_byte v[14:15], v32 offset:135 +; ALIGNED-NEXT: flat_store_byte v[14:15], v39 offset:133 +; ALIGNED-NEXT: flat_store_byte v[14:15], v66 offset:131 +; ALIGNED-NEXT: flat_store_byte v[14:15], v34 offset:129 +; ALIGNED-NEXT: flat_store_byte v[14:15], 
v102 offset:127 +; ALIGNED-NEXT: flat_store_byte v[14:15], v65 offset:125 +; ALIGNED-NEXT: flat_store_byte v[14:15], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[14:15], v48 offset:121 +; ALIGNED-NEXT: flat_store_byte v[14:15], v31 offset:119 +; ALIGNED-NEXT: flat_store_byte v[14:15], v35 offset:117 +; ALIGNED-NEXT: flat_store_byte v[14:15], v87 offset:115 +; ALIGNED-NEXT: flat_store_byte v[14:15], v27 offset:113 +; ALIGNED-NEXT: flat_store_byte v[14:15], v103 offset:111 +; ALIGNED-NEXT: flat_store_byte v[14:15], v49 offset:109 +; ALIGNED-NEXT: flat_store_byte v[14:15], v86 offset:107 +; ALIGNED-NEXT: flat_store_byte v[14:15], v36 offset:105 +; ALIGNED-NEXT: flat_store_byte v[14:15], v26 offset:103 +; ALIGNED-NEXT: flat_store_byte v[14:15], v28 offset:101 +; ALIGNED-NEXT: flat_store_byte v[14:15], v82 offset:99 +; ALIGNED-NEXT: flat_store_byte v[14:15], v22 offset:97 +; ALIGNED-NEXT: flat_store_byte v[14:15], v112 offset:95 +; ALIGNED-NEXT: flat_store_byte v[14:15], v37 offset:93 +; ALIGNED-NEXT: flat_store_byte v[14:15], v99 offset:91 +; ALIGNED-NEXT: flat_store_byte v[14:15], v29 offset:89 +; ALIGNED-NEXT: flat_store_byte v[14:15], v53 offset:87 +; ALIGNED-NEXT: flat_store_byte v[14:15], v23 offset:85 +; ALIGNED-NEXT: flat_store_byte v[14:15], v96 offset:83 +; ALIGNED-NEXT: flat_store_byte v[14:15], v19 offset:81 +; ALIGNED-NEXT: flat_store_byte v[14:15], v113 offset:79 +; ALIGNED-NEXT: flat_store_byte v[14:15], v30 offset:77 +; ALIGNED-NEXT: flat_store_byte v[14:15], v83 offset:75 +; ALIGNED-NEXT: flat_store_byte v[14:15], v24 offset:73 +; ALIGNED-NEXT: flat_store_byte v[14:15], v52 offset:71 +; ALIGNED-NEXT: flat_store_byte v[14:15], v20 offset:69 +; ALIGNED-NEXT: flat_store_byte v[14:15], v68 offset:67 +; ALIGNED-NEXT: flat_store_byte v[14:15], v17 offset:65 +; ALIGNED-NEXT: flat_store_byte v[14:15], v114 offset:63 +; ALIGNED-NEXT: flat_store_byte v[14:15], v25 offset:61 +; ALIGNED-NEXT: flat_store_byte v[14:15], v97 offset:59 +; ALIGNED-NEXT: 
flat_store_byte v[14:15], v21 offset:57 +; ALIGNED-NEXT: flat_store_byte v[14:15], v51 offset:55 +; ALIGNED-NEXT: flat_store_byte v[14:15], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[14:15], v84 offset:51 +; ALIGNED-NEXT: flat_store_byte v[14:15], v16 offset:49 +; ALIGNED-NEXT: flat_store_byte v[14:15], v115 offset:43 +; ALIGNED-NEXT: flat_store_byte v[14:15], v11 offset:41 +; ALIGNED-NEXT: flat_store_byte v[14:15], v69 offset:47 +; ALIGNED-NEXT: flat_store_byte v[14:15], v10 offset:45 +; ALIGNED-NEXT: flat_store_byte v[14:15], v50 offset:35 +; ALIGNED-NEXT: flat_store_byte v[14:15], v9 offset:33 +; ALIGNED-NEXT: flat_store_byte v[14:15], v54 offset:39 +; ALIGNED-NEXT: flat_store_byte v[14:15], v8 offset:37 +; ALIGNED-NEXT: flat_store_byte v[14:15], v100 offset:31 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:29 +; ALIGNED-NEXT: flat_store_byte v[14:15], v85 offset:27 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:25 +; ALIGNED-NEXT: flat_store_byte v[14:15], v81 offset:23 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:21 +; ALIGNED-NEXT: flat_store_byte v[14:15], v70 offset:19 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v3 offset:14 +; ALIGNED-NEXT: flat_store_byte v[14:15], v3 offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v2 offset:10 +; ALIGNED-NEXT: flat_store_byte v[14:15], v2 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v1 offset:6 +; ALIGNED-NEXT: flat_store_byte v[14:15], v1 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[14:15], v0 offset:2 +; ALIGNED-NEXT: flat_store_byte v[14:15], v0 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v6, 24, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; ALIGNED-NEXT: flat_store_byte v[14:15], v4 offset:15 +; ALIGNED-NEXT: flat_store_byte v[14:15], v3 offset:13 +; ALIGNED-NEXT: flat_store_byte v[14:15], v5 offset:11 +; ALIGNED-NEXT: flat_store_byte v[14:15], v2 offset:9 +; ALIGNED-NEXT: flat_store_byte v[14:15], v6 offset:7 +; ALIGNED-NEXT: flat_store_byte v[14:15], v1 offset:5 +; ALIGNED-NEXT: flat_store_byte v[14:15], v7 offset:3 +; ALIGNED-NEXT: flat_store_byte v[14:15], v0 offset:1 +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0xffffff00, v14 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, -1, v15, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 -; ALIGNED-NEXT: .LBB5_6: ; %Flow6 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: .LBB5_6: ; %Flow16 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x9 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -6817,27 +6827,31 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB5_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] -; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 -; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[4:5] +; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[4:5] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[16:19], v[4:5] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11] ; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: 
flat_store_dwordx4 v[6:7], v[12:15] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB5_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 @@ -6846,44 +6860,47 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; UNROLL3-NEXT: flat_load_dwordx4 v[2:5], v[2:3] offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB5_4: ; %Flow3 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr2 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB5_4: ; %Flow13 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB5_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2032 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016 +; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], v[2:3] offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo ; UNROLL3-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13] -; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16 -; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], v[2:3] +; UNROLL3-NEXT: flat_load_dwordx4 v[10:13], v[2:3] offset:16 +; UNROLL3-NEXT: flat_load_dwordx4 v[14:17], v[2:3] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9] ; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB5_6 -; UNROLL3-NEXT: .LBB5_7: ; %Flow4 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB5_7: ; 
%Flow14 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -6900,756 +6917,759 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB6_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: global_load_dwordx4 
v[4:7], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[12:15], off offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: global_store_dwordx4 
v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[64:67], off offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: 
global_store_dwordx4 v[0:1], v[96:99], off offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 -; CHECK-NEXT: .LBB6_3: ; %Flow7 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB6_3: ; %Flow17 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB6_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64 -; CHECK-NEXT: 
global_load_dwordx4 v[64:67], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16 -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off 
offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[12:15], off offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[64:67], off offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
global_store_dwordx4 v[100:101], v[80:83], off offset:48 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[96:99], off offset:16 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB6_5 -; CHECK-NEXT: .LBB6_6: ; %Flow8 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB6_6: ; %Flow18 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p1_p1_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[20:21], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[20:21], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[20:21], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144 -; 
ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:208 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v17, 
off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v19, off offset:254 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v18, off offset:250 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v17, off offset:246 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:242 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v18, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v16, off offset:242 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v23, off offset:238 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v22, off offset:234 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v21, off offset:230 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi 
v[0:1], v20, off offset:226 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v21 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v27, off offset:222 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v26, off offset:218 
+; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:216 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v25, off offset:214 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:218 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v24, off offset:210 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:64 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v31, off offset:206 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:204 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v30, off offset:202 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:200 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v29, off offset:198 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:196 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v28, off offset:194 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:176 -; ALIGNED-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v35, off offset:190 +; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v34, off offset:186 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v33, off offset:182 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v32, off offset:178 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; 
ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v39, off offset:174 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:172 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v38, off offset:170 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:168 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v37, off offset:166 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:164 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v36, off offset:162 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: 
s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v51, off offset:158 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v50, off offset:154 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v49, off offset:150 +; ALIGNED-NEXT: 
global_store_byte v[0:1], v49, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v48, off offset:146 +; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword 
v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v55, off offset:142 +; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v54, off offset:138 +; ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v53, off offset:134 +; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v52, off offset:130 +; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:112 -; ALIGNED-NEXT: 
buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v67, off offset:126 +; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v66, off offset:122 +; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v65, off offset:118 +; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v64, off offset:114 +; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; 
ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v71, off offset:110 +; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v70, off offset:106 +; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:104 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v69, off offset:102 +; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:100 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v68, off offset:98 +; ALIGNED-NEXT: global_store_byte v[0:1], v68, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: 
s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:90 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v83, off offset:94 +; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v82, off offset:90 +; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:88 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v81, off offset:86 +; ALIGNED-NEXT: global_store_byte 
v[0:1], v81, off offset:84 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v80, off offset:82 +; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: 
buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v87, off offset:78 +; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:76 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v86, off offset:74 +; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:72 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v85, off offset:70 +; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v84, off offset:66 +; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:56 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v99, off offset:62 +; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:60 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v98, off offset:58 +; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:56 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v97, off offset:54 +; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v96, off offset:50 +; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; 
ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v15, off offset:42 +; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:40 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v14, off offset:46 +; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v13, off offset:34 +; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v12, off offset:38 +; ALIGNED-NEXT: 
global_store_byte v[0:1], v12, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v11, off offset:30 +; ALIGNED-NEXT: global_store_byte v[0:1], v11, off 
offset:28 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v10, off offset:26 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:24 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v9, off offset:22 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:20 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v8, off offset:18 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:245 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:253 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; 
ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:235 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:227 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v36 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:215 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v55 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:197 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:205 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v30, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:181 +; ALIGNED-NEXT: 
global_store_byte v[0:1], v35, off offset:189 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:177 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v32, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:173 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:159 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:163 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:159 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:107 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:89 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:19 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off +; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:151 +; ALIGNED-NEXT: global_store_byte v[0:1], v49, off offset:149 +; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:147 +; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:145 +; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:143 +; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:141 +; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:139 +; 
ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:137 +; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:135 +; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:133 +; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:131 +; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:129 +; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:127 +; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:125 +; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:123 +; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:121 +; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:119 +; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:117 +; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:115 +; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:113 +; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:111 +; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:109 +; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:107 +; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:105 +; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:103 +; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:101 +; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:99 +; ALIGNED-NEXT: global_store_byte v[0:1], v68, off offset:97 +; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:95 +; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:93 +; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:91 +; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:89 +; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:87 +; ALIGNED-NEXT: global_store_byte v[0:1], v81, off offset:85 +; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:83 +; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:81 +; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:77 +; ALIGNED-NEXT: global_store_byte v[0:1], v33, off 
offset:75 +; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:73 +; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:71 +; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:69 +; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:67 +; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:65 +; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:63 +; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:61 +; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:59 +; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:57 +; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:55 +; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:53 +; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:51 +; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:49 +; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:43 +; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:41 +; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:47 +; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:45 +; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:35 +; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:33 +; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:39 +; ALIGNED-NEXT: global_store_byte v[0:1], v12, off offset:37 +; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:31 +; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:29 +; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:27 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:25 +; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:23 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:21 +; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:19 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v7, off offset:14 +; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:12 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v6, off offset:10 +; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v5, off offset:6 +; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v4, off offset:2 +; ALIGNED-NEXT: global_store_byte v[0:1], v4, off ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -7658,653 +7678,660 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:15 +; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:13 +; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:11 +; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:9 +; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:7 +; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:5 +; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:3 +; ALIGNED-NEXT: global_store_byte v[0:1], v4, off offset:1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_2 -; ALIGNED-NEXT: 
.LBB6_3: ; %Flow7 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB6_3: ; %Flow17 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB6_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x700, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 
v[100:103], v[24:25], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[12:13], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[12:13], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[0:3], v[12:13], off +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[12:13], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[12:13], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[12:13], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[12:13], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[12:13], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[12:13], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[12:13], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[54:57], v[12:13], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[12:13], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[12:13], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[12:13], off offset:208 +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffff00, v12 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; 
ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v33, off offset:254 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v32, off offset:250 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:248 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v27, off offset:246 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:244 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v26, off offset:242 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:240 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v53, off offset:238 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:236 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v52, off offset:234 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:232 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v51, off offset:230 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:228 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v50, off offset:226 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:224 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 
offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v81, off offset:222 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:220 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v80, off offset:218 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:216 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v67, off offset:214 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:212 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v66, off offset:210 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:208 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v98, off offset:206 +; ALIGNED-NEXT: 
global_store_byte v[14:15], v98, off offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v87, off offset:202 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:200 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v86, off offset:198 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:196 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v82, off offset:194 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:192 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v99, off offset:190 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:188 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v96, off offset:186 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:184 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v83, off offset:182 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:180 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v68, off offset:178 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:176 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v97, off offset:174 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:172 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v84, off offset:170 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:168 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v69, off offset:166 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:164 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v54, off offset:162 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:160 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v85, off offset:158 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v70, off offset:154 +; ALIGNED-NEXT: global_store_byte v[14:15], 
v70, off offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v55, off offset:150 +; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:148 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v38, off offset:146 +; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:144 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v71, off offset:142 +; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v64, off offset:138 +; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v39, off offset:134 +; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:132 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v34, off offset:130 +; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:128 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 
offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v65, off offset:126 +; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v48, off offset:122 +; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:120 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v35, off offset:118 +; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v28, off offset:114 +; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:112 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v49, off 
offset:110 +; ALIGNED-NEXT: global_store_byte v[14:15], v49, off offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v36, off offset:106 +; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v29, off offset:102 +; ALIGNED-NEXT: global_store_byte v[14:15], v29, off offset:100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v22, off offset:98 +; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:96 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:440 +; 
ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v37, off offset:94 +; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v30, off offset:90 +; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v23, off offset:86 +; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:84 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v19, off offset:82 +; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:80 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, 
s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v31, off offset:78 +; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:76 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v24, off offset:74 +; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:72 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v20, off offset:70 +; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:68 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160 -; ALIGNED-NEXT: 
buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v17, off offset:66 +; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:64 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v25, off offset:62 +; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:60 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: 
global_store_byte_d16_hi v[16:17], v83, off offset:154 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v21, off offset:58 +; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v18, off offset:54 +; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:52 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v16, off offset:50 +; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:548 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v11, off offset:42 +; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:40 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v10, off offset:46 +; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v9, off offset:34 +; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:32 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v8, off offset:38 +; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:36 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 
offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v7, off offset:30 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:28 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v6, off offset:26 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:24 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v5, off offset:22 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:20 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v4, off offset:18 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:16 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v34, off offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword 
v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword 
v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:247 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; ALIGNED-NEXT: 
global_store_byte v[14:15], v112, off offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:235 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: 
global_store_byte v[16:17], v64, off offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:215 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71 +; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v34 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:203 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; 
ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 +; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:183 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v20 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off 
offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:171 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v9 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147 -; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145 -; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143 -; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141 -; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139 -; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137 -; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135 -; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133 -; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131 -; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129 -; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127 -; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125 -; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123 -; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121 -; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119 -; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117 -; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115 -; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113 -; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111 -; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109 -; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107 -; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105 -; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103 -; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101 -; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99 -; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97 -; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95 -; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93 -; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91 -; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87 -; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85 -; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83 -; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81 -; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79 -; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77 -; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75 -; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73 -; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71 -; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69 -; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67 -; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65 -; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63 -; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61 -; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59 -; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57 -; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55 -; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53 -; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51 -; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49 -; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43 -; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41 -; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47 -; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45 -; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35 -; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33 -; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39 -; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37 -; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29 -; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27 -; 
ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25 -; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23 -; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21 -; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:159 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v6 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:155 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15 -; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13 -; ALIGNED-NEXT: global_store_byte 
v[16:17], v9, off offset:11 -; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9 -; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7 -; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5 -; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3 -; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 +; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:151 +; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:149 +; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:147 +; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:145 +; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:143 +; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:141 +; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:139 +; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:137 +; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:135 +; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:133 +; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:131 +; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:129 +; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:127 +; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:125 +; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:123 +; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:121 +; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:119 +; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:117 +; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:115 +; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:113 +; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:111 +; ALIGNED-NEXT: global_store_byte v[14:15], v49, off offset:109 +; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:107 +; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:105 +; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:103 +; ALIGNED-NEXT: 
global_store_byte v[14:15], v29, off offset:101 +; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:99 +; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:97 +; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:95 +; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:93 +; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:91 +; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:89 +; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:87 +; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:85 +; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:83 +; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:81 +; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:79 +; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:77 +; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:75 +; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:73 +; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:71 +; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:69 +; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:67 +; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:65 +; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:63 +; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:61 +; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:59 +; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:57 +; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:55 +; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:53 +; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:51 +; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:49 +; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:43 +; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:41 +; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:47 +; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:45 +; ALIGNED-NEXT: 
global_store_byte v[14:15], v50, off offset:35 +; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:33 +; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:39 +; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:37 +; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:31 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:29 +; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:27 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:25 +; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:23 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:21 +; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:19 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v3, off offset:14 +; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v2, off offset:10 +; ALIGNED-NEXT: global_store_byte v[14:15], v2, off offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v1, off offset:6 +; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v0, off offset:2 +; ALIGNED-NEXT: global_store_byte v[14:15], v0, off +; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:15 +; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:13 +; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:11 +; ALIGNED-NEXT: global_store_byte 
v[14:15], v2, off offset:9 +; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:7 +; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:5 +; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:3 +; ALIGNED-NEXT: global_store_byte v[14:15], v0, off offset:1 +; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0xffffff00, v14 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, -1, v15, vcc_lo ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 -; ALIGNED-NEXT: .LBB6_6: ; %Flow8 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: .LBB6_6: ; %Flow18 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x9 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8316,27 +8343,31 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 
1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB6_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[8:11], off ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[12:15], off offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[16:19], off offset:32 +; 
UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB6_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 @@ -8345,44 +8376,47 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; UNROLL3-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB6_4: ; %Flow5 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr2 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB6_4: ; %Flow15 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB6_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2032 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016 +; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[6:9], off ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16 +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[14:17], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_6 -; UNROLL3-NEXT: .LBB6_7: ; %Flow6 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB6_7: ; %Flow16 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 
%dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false) @@ -8398,1137 +8432,1144 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execz .LBB7_3 ; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 ; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:240 +; CHECK-NEXT: 
global_load_dwordx4 v[8:11], v[2:3], off offset:224 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:208 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:192 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:176 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:160 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:144 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:128 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:112 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:96 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:80 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:64 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:48 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:32 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:176 ; CHECK-NEXT: 
s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 -; CHECK-NEXT: .LBB7_3: ; %Flow6 -; CHECK-NEXT: 
s_andn2_saveexec_b32 s8, s6 +; CHECK-NEXT: .LBB7_3: ; %Flow16 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB7_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176 -; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160 -; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144 -; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128 -; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112 -; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96 -; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80 -; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64 -; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48 -; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 -; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 -; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, 
-1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:2000 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1984 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1968 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1952 +; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1936 +; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1920 +; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1904 +; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1888 +; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1872 +; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1856 +; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:1840 +; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:1824 +; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:1808 +; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:1792 +; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:224 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:208 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[24:27] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, 
v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB7_5 -; CHECK-NEXT: .LBB7_6: ; %Flow7 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: .LBB7_6: ; %Flow17 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; ALIGNED-LABEL: memmove_p0_p4_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 ; ALIGNED-NEXT: s_cbranch_execz .LBB7_3 ; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 ; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: 
global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 
offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:254 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v97 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:244 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v97, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:240 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:251 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:249 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:255 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:247 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, 
s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v83 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:228 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[0:1], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:235 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v70 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:208 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:215 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:203 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 
offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:248 +; 
ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 
24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v55, off, 
s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:175 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) 
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:153 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:157 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:151 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, 
v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:149 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: 
buffer_store_dword v36, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:139 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 
offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:119 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 
24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 
offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v49, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 
offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:89 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:87 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:81 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:80 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:73 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:71 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:67 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 +; 
ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:43 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:41 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:47 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:40 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 
offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 -; ALIGNED-NEXT: .LBB7_3: ; %Flow6 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 +; ALIGNED-NEXT: .LBB7_3: ; %Flow16 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB7_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, 
-1 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:2032 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:2016 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:2000 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off 
offset:1984 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1968 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1952 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1936 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1920 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1904 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1888 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1872 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1856 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1840 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:1824 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:1808 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:1792 +; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v98 offset:250 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v99 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:248 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v97 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v96 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v98 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 -; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 -; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, 
v49 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v54, off, 
s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:251 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:255 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:253 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:247 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v82 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v103 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v83 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v86 offset:234 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v87 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v85 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v84 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v81 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:235 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:231 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:227 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v99, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v82 offset:218 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v83 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v81 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v80 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:208 +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v67 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:219 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:223 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:215 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:211 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:300 
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v70 offset:202 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v71 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:200 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v69 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v68 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:203 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:207 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:199 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:195 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[0:1], v66 offset:186 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v67 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:184 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v65 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v64 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:187 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:191 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:179 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v54 offset:170 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v55 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:172 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v53 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v52 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v40, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v41, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v42, 8, v32 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], 
s32 offset:300 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 
offset:284 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], 
v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v54, 24, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:167 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:165 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:163 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v50 offset:154 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v51 offset:158 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:152 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v49 offset:150 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v48 offset:146 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:153 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:159 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:157 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:151 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:149 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:147 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:145 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:360 +; 
ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v38 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v39 offset:142 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:140 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:136 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v37 offset:134 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v36 offset:130 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:139 +; ALIGNED-NEXT: flat_store_byte v[0:1], v116 offset:137 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:143 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:141 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:135 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:133 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:131 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:129 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v34 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v35 offset:126 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:124 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:120 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v33 offset:118 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:114 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v67, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:123 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:121 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:127 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:125 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:119 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v40 offset:117 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:115 +; ALIGNED-NEXT: flat_store_byte v[0:1], v42 offset:113 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:96 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:107 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:105 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:111 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v26 +; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:109 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:408 
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v26 offset:90 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v27 offset:94 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v25 offset:86 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v24 offset:82 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:91 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:89 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:95 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:93 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:87 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:85 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:83 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:81 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v22 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v23 offset:78 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:76 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:72 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v21 offset:70 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v20 offset:66 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:75 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:73 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v87 offset:79 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:77 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:71 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:69 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:67 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:65 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v18 offset:58 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:59 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:57 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v19 offset:62 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:63 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:60 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:61 +; ALIGNED-NEXT: flat_store_byte v[0:1], v18 offset:56 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v17 offset:54 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:53 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:52 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:51 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v16 offset:50 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:49 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:48 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:43 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v14 offset:42 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 
offset:40 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:47 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v15 offset:46 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:45 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:44 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v12 offset:34 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:33 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:39 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v13 offset:38 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:37 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:36 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:26 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:27 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:30 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:31 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:28 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:29 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:23 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:22 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:21 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:19 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:492 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v6 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:11 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:9 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:7 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:6 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:5 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:3 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:1 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 -; ALIGNED-NEXT: .LBB7_6: ; %Flow7 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; ALIGNED-NEXT: .LBB7_6: ; %Flow17 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; UNROLL3-LABEL: memmove_p0_p4_sz2048: @@ -9539,27 +9580,31 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB7_4 ; UNROLL3-NEXT: ; %bb.1: ; 
%memmove_fwd_loop.preheader -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v3 +; UNROLL3-NEXT: v_mov_b32_e32 v7, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v6, v0 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off +; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[12:15] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 
.LBB7_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; UNROLL3-NEXT: s_clause 0x1 @@ -9570,44 +9615,45 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB7_4: ; %Flow4 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB7_4: ; %Flow14 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB7_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: s_clause 0x1 -; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2032 +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:2016 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2032 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:2016 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 -; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo ; UNROLL3-NEXT: s_clause 0x2 
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off -; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:1984 +; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:1968 +; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:2000 +; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9] offset:16 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB7_6 -; UNROLL3-NEXT: .LBB7_7: ; %Flow5 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: .LBB7_7: ; %Flow15 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: @@ -9829,207 +9875,205 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 ; CHECK-NEXT: s_cbranch_execz .LBB8_6 ; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 -; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; CHECK-NEXT: 
s_movk_i32 s4, 0xf800 ; CHECK-NEXT: s_mov_b32 s5, -1 ; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v28, 
v1, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 
offen offset:32 -; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2032 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2028 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:2024 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:2020 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:2016 +; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:2012 +; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:2008 +; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:2004 +; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:2000 +; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:1996 +; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:1992 +; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:1988 +; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:1984 +; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:1980 +; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:1976 +; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:1972 +; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:1968 +; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen 
offset:1964 +; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:1960 +; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:1956 +; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:1952 +; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:1948 +; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:1944 +; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:1940 +; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:1936 +; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:1932 +; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:1928 +; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:1924 +; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:1920 +; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:1916 +; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:1912 +; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:1908 +; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:1904 +; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:1900 +; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:1896 +; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:1892 +; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:1888 +; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:1884 +; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:1880 +; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:1876 +; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:1872 +; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:1868 +; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:1864 +; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:1860 +; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:1856 +; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:1852 +; CHECK-NEXT: 
buffer_load_dword v67, v1, s[0:3], 0 offen offset:1848 +; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:1844 +; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:1840 +; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:1836 +; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:1832 +; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:1828 +; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:1824 +; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:1820 +; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:1816 +; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:1812 +; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:1808 +; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:1804 +; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:1800 +; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:1796 +; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen offset:1792 ; CHECK-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(62) -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040 ; CHECK-NEXT: s_waitcnt vmcnt(61) -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036 ; CHECK-NEXT: s_waitcnt vmcnt(60) -; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032 ; CHECK-NEXT: s_waitcnt vmcnt(59) -; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2028 ; 
CHECK-NEXT: s_waitcnt vmcnt(58) -; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:2024 ; CHECK-NEXT: s_waitcnt vmcnt(57) -; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:2020 ; CHECK-NEXT: s_waitcnt vmcnt(56) -; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:2016 ; CHECK-NEXT: s_waitcnt vmcnt(55) -; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:2012 ; CHECK-NEXT: s_waitcnt vmcnt(54) -; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:2008 ; CHECK-NEXT: s_waitcnt vmcnt(53) -; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:2004 ; CHECK-NEXT: s_waitcnt vmcnt(52) -; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:2000 ; CHECK-NEXT: s_waitcnt vmcnt(51) -; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:1996 ; CHECK-NEXT: s_waitcnt vmcnt(50) -; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:1992 ; CHECK-NEXT: s_waitcnt vmcnt(49) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:1988 ; CHECK-NEXT: s_waitcnt vmcnt(48) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:1984 ; CHECK-NEXT: s_waitcnt vmcnt(47) -; CHECK-NEXT: buffer_store_dword v18, v0, 
s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:1980 ; CHECK-NEXT: s_waitcnt vmcnt(46) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:1976 ; CHECK-NEXT: s_waitcnt vmcnt(45) -; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:1972 ; CHECK-NEXT: s_waitcnt vmcnt(44) -; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:1968 ; CHECK-NEXT: s_waitcnt vmcnt(43) -; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:1964 ; CHECK-NEXT: s_waitcnt vmcnt(42) -; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:1960 ; CHECK-NEXT: s_waitcnt vmcnt(41) -; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:1956 ; CHECK-NEXT: s_waitcnt vmcnt(40) -; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:1952 ; CHECK-NEXT: s_waitcnt vmcnt(39) -; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:1948 ; CHECK-NEXT: s_waitcnt vmcnt(38) -; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:1944 ; CHECK-NEXT: s_waitcnt vmcnt(37) -; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:1940 ; CHECK-NEXT: s_waitcnt vmcnt(36) -; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_store_dword v29, v0, 
s[0:3], 0 offen offset:1936 ; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:1932 ; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:1928 ; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:1924 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:1920 ; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:1916 ; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:1912 ; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:1908 ; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:1904 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:1900 ; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:1896 ; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:1892 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; 
CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:1888 ; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:1884 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:1880 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:1876 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:1872 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:1868 ; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:1864 ; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:1860 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:1856 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:1852 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:1848 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52 +; CHECK-NEXT: 
buffer_store_dword v68, v0, s[0:3], 0 offen offset:1844 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:1840 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:1836 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:1832 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:1828 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:1824 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:1820 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:1816 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:1812 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:1808 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:1804 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:1800 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; 
CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:1796 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen offset:1792 ; CHECK-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 ; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB8_5 @@ -11145,1055 +11189,1053 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB8_6 ; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1 -; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0 ; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 ; ALIGNED-NEXT: s_mov_b32 s5, -1 ; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2047 ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2046 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2045 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2044 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2043 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2042 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2041 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2040 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2039 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2038 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2037 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2036 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2035 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2034 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2033 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2032 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2031 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2030 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2029 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2028 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2027 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2026 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2025 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2024 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2023 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2022 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte 
v2, v1, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2021 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2020 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2019 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2018 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2017 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2016 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2015 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2014 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2013 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2012 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2011 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2010 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2009 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2008 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2007 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:214 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2006 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186 -; 
ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158 -; 
ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2004 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1994 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, 
s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:1985 +; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:1983 +; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:1979 +; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:1975 +; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: 
buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:1963 +; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:1962 +; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:1959 +; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:1957 +; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:1946 +; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:1944 +; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:1942 
+; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:1936 +; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1932 +; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1931 +; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:1929 ; ALIGNED-NEXT: s_clause 0xa -; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:1928 +; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_load_ubyte v9, v1, 
s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:1920 +; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1919 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1918 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: 
buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_load_ubyte v41, v1, 
s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:1916 +; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:1912 +; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:1904 +; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:1900 +; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: 
buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:1896 +; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:1889 +; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:1888 +; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:1884 +; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:1880 +; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:1879 +; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:1872 +; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:1871 +; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:1869 
+; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:1866 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1865 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1864 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1863 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1862 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1861 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1860 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1859 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1858 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1857 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1856 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1855 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1854 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1853 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1852 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1851 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1850 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1849 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1848 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1847 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1846 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1845 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1844 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1843 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1842 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1841 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1840 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1839 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1838 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1837 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1836 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1835 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1834 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1833 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1832 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1831 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1830 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1829 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 
offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1828 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1827 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1826 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1825 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1824 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1823 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1822 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1821 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1820 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1819 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1818 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1817 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1816 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1815 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1814 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21 +; 
ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1813 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1812 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1811 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1810 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1809 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1808 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1807 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1806 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1805 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1804 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1803 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1801 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1799 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v2, 
v1, s[0:3], 0 offen offset:1798 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1797 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1796 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1795 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1794 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1793 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1792 ; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:2047 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2046 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2045 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2044 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2043 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2042 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2041 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2040 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2039 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2038 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2037 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2036 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2035 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2034 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2033 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_store_byte v2, 
v0, s[0:3], 0 offen offset:2032 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2031 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2030 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2029 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2028 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2027 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2026 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2025 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2024 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2023 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2022 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2021 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2020 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen 
offset:224 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:504 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 
offen offset:182 -; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen 
offset:154 -; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128 -; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:2019 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2018 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2017 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2016 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2015 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2014 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2013 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2012 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2011 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2010 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2009 
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2008 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2007 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2006 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2004 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1994 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:1985 +; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 
offen offset:1983 +; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:1979 +; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:1975 +; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:1963 +; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:1962 +; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:1959 +; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:1957 +; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_store_byte 
v39, v0, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:1946 +; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:1944 +; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1942 +; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:1936 +; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1932 +; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1931 +; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:1929 +; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:1928 +; ALIGNED-NEXT: 
buffer_store_byte v11, v0, s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1920 +; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1919 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124 -; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:120 -; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112 -; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: 
buffer_store_byte v95, v0, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108 -; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104 -; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96 -; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88 -; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_byte v43, v0, 
s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1918 +; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:1916 +; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:1912 +; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:1904 +; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:1900 +; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: 
buffer_store_byte v92, v0, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:1896 +; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:1889 +; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:1888 +; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:1884 +; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:1880 +; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:1879 +; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:1872 +; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:1871 
+; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:1869 +; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:1866 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1865 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1864 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1863 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1862 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1861 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1860 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1859 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1858 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1857 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1856 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1855 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1854 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1853 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60 +; 
ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1852 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1851 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1850 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1849 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1848 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1847 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1846 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1845 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1844 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1843 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1842 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1841 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1840 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1839 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1838 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_store_byte v2, 
v0, s[0:3], 0 offen offset:1837 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1836 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1835 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1834 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1833 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1832 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1831 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1830 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1829 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1828 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1827 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1826 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1825 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1824 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1823 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1822 ; 
ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1821 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1820 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1819 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1818 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1817 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1816 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1815 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1814 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1813 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1812 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1811 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1810 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1809 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1808 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1807 ; ALIGNED-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1806 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1805 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1804 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1803 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1801 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1800 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1799 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1798 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1797 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1796 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1795 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1794 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1793 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1792 ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0 ; ALIGNED-NEXT: 
s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 @@ -12355,63 +12397,61 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032 ; UNROLL3-NEXT: s_clause 0x3 -; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2028 -; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024 -; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020 -; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016 -; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1 -; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0 +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: s_waitcnt vmcnt(3) -; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016 ; UNROLL3-NEXT: .LBB8_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen offset:44 -; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 
offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:28 -; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2012 +; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2008 +; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2004 +; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2000 +; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:1996 +; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:1992 +; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:1988 +; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:1984 +; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:1980 +; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:1976 +; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:1972 +; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:1968 ; UNROLL3-NEXT: v_subrev_nc_u32_e32 v1, 48, v1 ; UNROLL3-NEXT: s_add_u32 s4, s4, 48 ; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(11) -; UNROLL3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2012 ; UNROLL3-NEXT: s_waitcnt vmcnt(10) -; UNROLL3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2008 ; UNROLL3-NEXT: s_waitcnt vmcnt(9) -; 
UNROLL3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2004 ; UNROLL3-NEXT: s_waitcnt vmcnt(8) -; UNROLL3-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2000 ; UNROLL3-NEXT: s_waitcnt vmcnt(7) -; UNROLL3-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:1996 ; UNROLL3-NEXT: s_waitcnt vmcnt(6) -; UNROLL3-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:1992 ; UNROLL3-NEXT: s_waitcnt vmcnt(5) -; UNROLL3-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:1988 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:1984 ; UNROLL3-NEXT: s_waitcnt vmcnt(3) -; UNROLL3-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:1980 ; UNROLL3-NEXT: s_waitcnt vmcnt(2) -; UNROLL3-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:1976 ; UNROLL3-NEXT: s_waitcnt vmcnt(1) -; UNROLL3-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:1972 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 +; UNROLL3-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:1968 +; UNROLL3-NEXT: v_subrev_nc_u32_e32 v0, 48, v0 ; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB8_6 ; UNROLL3-NEXT: .LBB8_7: ; %Flow17 @@ -12427,13 +12467,14 @@ define void 
@memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_mov_b32 s6, exec_lo +; CHECK-NEXT: s_mov_b32 s4, exec_lo ; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v3 -; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6 -; CHECK-NEXT: s_cbranch_execz .LBB9_2 -; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop +; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execz .LBB9_3 +; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; CHECK-NEXT: s_mov_b64 s[4:5], 0x800 +; CHECK-NEXT: .LBB9_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 @@ -12500,145 +12541,143 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:240 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[27:30] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[23:26] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; CHECK-NEXT: s_cbranch_scc1 .LBB9_1 -; CHECK-NEXT: .LBB9_2: ; %Flow10 -; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 -; CHECK-NEXT: s_cbranch_execz .LBB9_5 -; CHECK-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x700, v2 -; CHECK-NEXT: s_movk_i32 s6, 0xff00 -; 
CHECK-NEXT: s_mov_b64 s[4:5], 0x700 -; CHECK-NEXT: s_mov_b32 s7, -1 -; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_2 +; CHECK-NEXT: .LBB9_3: ; %Flow16 +; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6 +; CHECK-NEXT: s_cbranch_execz .LBB9_6 +; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; CHECK-NEXT: s_movk_i32 s4, 0xf800 +; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: .LBB9_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_clause 0x3e -; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76 -; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80 -; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: 
buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: 
buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 -; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:1824 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:1828 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1832 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1836 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1840 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1844 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1848 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1852 +; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:1868 +; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen 
offset:1884 +; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:1880 +; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:1876 +; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:1872 +; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:1864 +; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:1860 +; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1856 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:1900 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:1916 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:1912 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:1908 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:1904 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:1896 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:1892 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:1888 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:1964 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:1980 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:1976 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:1972 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:1968 +; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:1960 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:1956 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:1952 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:2028 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:2044 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:2040 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:2036 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:2032 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:2024 +; CHECK-NEXT: 
buffer_load_dword v36, v2, s[0:3], 0 offen offset:2020 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:2016 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:1996 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:2012 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:2008 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:2004 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:2000 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:1992 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:1988 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:1984 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:1948 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:1944 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:1940 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:1936 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:1932 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:1928 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:1924 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:1920 +; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:1792 +; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:1796 +; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:1800 +; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:1808 +; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:1812 +; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:1816 +; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:1820 +; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:1804 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 -; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 -; CHECK-NEXT: s_addc_u32 s5, s5, -1 -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[31:34] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:240 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[35:38] offset:224 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:208 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[31:34] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[27:30] offset:160 +; CHECK-NEXT: s_waitcnt vmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[23:26] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[19:22] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[96:99] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; CHECK-NEXT: s_cbranch_scc0 .LBB9_4 -; CHECK-NEXT: .LBB9_5: ; %Flow11 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[84:87] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB9_5 +; CHECK-NEXT: .LBB9_6: ; %Flow17 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] ; @@ -12694,3100 +12733,3109 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo -; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 -; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 -; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 -; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop +; ALIGNED-NEXT: s_mov_b32 s4, exec_lo +; ALIGNED-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo +; ALIGNED-NEXT: 
v_cmpx_ge_u32_e64 v2, v3 +; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 +; ALIGNED-NEXT: s_cbranch_execz .LBB9_3 +; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800 +; ALIGNED-NEXT: .LBB9_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v10, 
v2, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:57 +; 
ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, 
s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 
v0, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v7 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 8, v8 +; ALIGNED-NEXT: 
v_lshl_or_b32 v8, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v21, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(40) -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) -; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 -; ALIGNED-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v11, v18, 8, v15 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v12, v22, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v25, 8, v24 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v29, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v15, v26, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v16, v30, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v14, 16, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v17, v32, 8, v31 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v38, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v37, 8, v36 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:848 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(17) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: v_lshl_or_b32 v7, v49, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v8, v52, 8, v51 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v9, v53, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 16, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 16, v8 +; ALIGNED-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v55 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v67 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v68, 8, v66 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v70, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v71 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, 
s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, 
off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword 
v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill 
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v83, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_store_dword v3, off, 
s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: buffer_store_dword v6, off, 
s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: 
buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v7 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1136 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen 
offset:113 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:135 +; 
ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, 
v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v125, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v126, 8, v124 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v121, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v108, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v107, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v111, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v95, 8, v105 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v93, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: 
buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v90, 8, v91 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v92, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) 
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 8, v79 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v72, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v75, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v61, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v59, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v56, 8, v57 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: 
buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: 
buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; 
ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:218 +; 
ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:243 +; 
ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v118, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v41, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v68, 16, v67 ; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, 
v3 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v113, 8, v115 ; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 -; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 
16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v112, 8, v114 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
s_clause 0x4 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v94, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v101, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v103, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v87, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v63, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v84, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v80, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v83, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v69, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v42, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v54, 8, v66 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v82, 8, v64 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v51, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v48, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v99, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v49, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v38, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v34, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v65, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v31, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v68, 16, v67 +; ALIGNED-NEXT: 
v_lshl_or_b32 v67, v25, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v20, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v33, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v23, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v19, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v28, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v15, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v14, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v24, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v13, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v18, v68, 16, v67 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v67, v67, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v7, 8, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v6, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v4, 8, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v68, v5, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v67, v5, 8, v9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: v_lshl_or_b32 v68, v7, 8, v4 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_mov_b32_e32 v5, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 16, v67 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 
offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:251 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:249 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:255 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:248 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:241 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:247 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v22 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:244 +; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:240 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v70, off, 
s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:233 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:239 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:237 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:232 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:227 +; ALIGNED-NEXT: flat_store_byte v[0:1], v51 offset:225 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:231 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:229 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:228 +; ALIGNED-NEXT: flat_store_byte v[0:1], v53 offset:224 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:209 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:211 +; ALIGNED-NEXT: flat_store_byte v[0:1], v64 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:213 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:215 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 
offset:212 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:219 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:217 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:223 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:221 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v85 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:216 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:208 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:202 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:203 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 
offset:201 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:207 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:205 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v102 offset:200 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:195 +; ALIGNED-NEXT: flat_store_byte v[0:1], v43 offset:193 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:199 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v40 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v44 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v45 offset:192 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v127, 8, v68 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v67, 8, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v3, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[0:1], v47 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v58 offset:187 +; ALIGNED-NEXT: flat_store_byte v[0:1], v56 offset:185 +; ALIGNED-NEXT: flat_store_byte 
v[0:1], v59 offset:191 +; ALIGNED-NEXT: flat_store_byte v[0:1], v61 offset:189 +; ALIGNED-NEXT: flat_store_byte v[0:1], v60 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v62 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v57 offset:184 +; ALIGNED-NEXT: flat_store_byte v[0:1], v73 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v74 offset:179 +; ALIGNED-NEXT: flat_store_byte v[0:1], v77 offset:177 +; ALIGNED-NEXT: flat_store_byte v[0:1], v72 offset:183 +; ALIGNED-NEXT: flat_store_byte v[0:1], v75 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v76 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v78 offset:180 +; ALIGNED-NEXT: flat_store_byte v[0:1], v79 offset:176 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[0:1], v88 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v92 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v90 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v93 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v95 offset:173 +; ALIGNED-NEXT: flat_store_byte v[0:1], v104 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v105 offset:172 +; ALIGNED-NEXT: flat_store_byte v[0:1], v91 offset:168 +; ALIGNED-NEXT: flat_store_byte v[0:1], v109 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v108 offset:163 +; ALIGNED-NEXT: flat_store_byte v[0:1], v121 offset:161 +; ALIGNED-NEXT: flat_store_byte v[0:1], v107 offset:167 +; ALIGNED-NEXT: flat_store_byte v[0:1], v111 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v120 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v122 offset:164 +; ALIGNED-NEXT: flat_store_byte v[0:1], v123 offset:160 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1452 ; 
4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[0:1], v124 offset:154 +; ALIGNED-NEXT: flat_store_byte v[0:1], v126 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v125 offset:153 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload 
+; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:159 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:157 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:158 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:156 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:152 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:146 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 
4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:147 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:145 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:151 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:149 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:150 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:148 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 
; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:144 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:138 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:139 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:137 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:143 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:141 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:142 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:140 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:136 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:130 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:131 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:129 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:135 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:133 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:134 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:132 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:128 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:122 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:123 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:121 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:125 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:126 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:124 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:120 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:114 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:115 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:113 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:119 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:117 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:118 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:116 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:112 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte 
Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:106 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:107 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:105 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 
4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:111 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:109 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:110 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:108 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:104 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:98 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:99 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:97 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:103 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:101 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:102 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:100 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:96 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:90 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded 
Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:91 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:89 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:95 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:93 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:94 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:92 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], 
v3 offset:88 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:82 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:83 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:81 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:87 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:85 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:86 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:84 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:80 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:74 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:75 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:73 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:79 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:77 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:78 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:76 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:72 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:66 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:67 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:65 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:71 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:69 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:68 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:64 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:61 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:58 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:59 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:57 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:884 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:63 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:62 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:60 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:56 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:53 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:50 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:54 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:51 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:49 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:55 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:54 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:52 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:48 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:43 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:42 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:46 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:41 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:40 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:47 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:46 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:45 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:44 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 -; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:35 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:34 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:33 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:32 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:39 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:38 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:37 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:36 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:26 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:27 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:25 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:31 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:29 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:30 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:28 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:18 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:17 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:23 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:21 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:22 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:16 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:10 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:11 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v6 offset:12 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:13 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:9 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:15 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:14 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:8 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:2 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:1 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:7 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:5 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:6 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 -; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 -; ALIGNED-NEXT: .LBB9_2: ; %Flow10 -; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 -; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 -; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 -; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 -; ALIGNED-NEXT: s_mov_b32 s7, -1 -; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:4 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_2 +; ALIGNED-NEXT: .LBB9_3: ; %Flow16 +; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6 +; ALIGNED-NEXT: s_cbranch_execz 
.LBB9_6 +; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; ALIGNED-NEXT: s_movk_i32 s4, 0xf800 +; ALIGNED-NEXT: s_mov_b32 s5, -1 +; ALIGNED-NEXT: .LBB9_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: 
buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 
offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1812 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1813 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1814 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1815 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1816 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:1817 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:1818 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:1811 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1820 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1821 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1822 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:1823 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1824 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1825 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:1826 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:1819 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:1827 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:1828 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:1829 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:1830 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:1831 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 
offen offset:1832 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:1833 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:1834 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:1835 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:1836 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:1837 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:1838 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:1839 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1840 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1841 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:1842 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1843 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:1844 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:1845 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:1846 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:1847 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1848 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:1849 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:1850 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:1852 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:1853 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:1854 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:1855 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:1856 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:1857 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:1858 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:1851 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:1859 +; ALIGNED-NEXT: buffer_load_ubyte 
v64, v2, s[0:3], 0 offen offset:1860 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:1861 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:1862 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:1863 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:1868 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:1869 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:1867 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:1870 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:1871 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(45) -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v9, 8, v7 ; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 ; ALIGNED-NEXT: s_waitcnt vmcnt(40) -; ALIGNED-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v17, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v21, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(39) -; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v18, 8, v15 ; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v22, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v26, 8, v24 ; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v29, 8, v28 ; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v15, v25, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v16, v30, 8, v27 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v7, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v14, 16, 
v13 +; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v17, v33, 8, v32 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v38, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v53, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v49 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v8, v52, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v9, v55, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 16, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1877 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v9, 16, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v31 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v68 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v66, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v70, 8, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1878 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1874 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 16, v5 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v71 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1875 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1866 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, 
off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword 
v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:776 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1865 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1864 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:880 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1879 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v83, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1876 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1873 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1872 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1890 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1894 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1895 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1886 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1887 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1885 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1883 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1884 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1052 
; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1882 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1893 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1881 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1880 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1891 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1892 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1889 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1888 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) 
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1906 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1910 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1911 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, 
s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1902 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1903 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1901 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1899 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1900 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1898 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1909 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1897 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1896 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1907 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1908 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1905 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1904 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1176 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1922 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1926 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1927 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1918 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1919 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, 
v4, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1917 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1915 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1916 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1914 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1925 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1913 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1912 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1923 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1924 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1921 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1920 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1938 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1942 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1943 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1934 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1935 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1933 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen 
offset:1931 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1932 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; 
ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1930 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1941 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1929 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1928 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill 
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1939 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1940 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1937 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1936 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 
-; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v8 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1950 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1951 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1949 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1947 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1948 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:1944 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:1945 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:1946 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v124, 8, v127 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen 
offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:1952 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:1953 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1954 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:1955 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:1956 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:1957 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:1958 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:1959 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v108, 8, v110 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v104, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v93, 8, v106 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v107, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:1964 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:1965 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1966 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:1967 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1963 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v90, 8, v91 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:1960 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:1961 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:1962 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1968 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:1969 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:1970 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:1971 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:1972 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:1973 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:1974 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:1975 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, 
v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v59, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v58, 8, v61 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v62, 8, v72 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:1980 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:1981 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:1982 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1983 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:1979 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v46, 8, v56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v45, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; 
ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1976 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:1977 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:1978 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v42, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v43, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1f +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:1984 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:1985 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:1986 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:1987 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:1988 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:1989 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:1990 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:1991 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:1996 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:1997 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:1998 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:1999 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:1995 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:1992 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:1993 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:1994 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:2004 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:2005 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:2006 +; 
ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:2007 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:2003 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:2008 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:2009 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:2010 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:2011 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:2012 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:2013 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:2014 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:2015 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:2000 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:2001 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:2002 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v35, v117, 8, v119 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v113, 8, v112 +; ALIGNED-NEXT: s_waitcnt vmcnt(20) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v98, 8, v100 +; ALIGNED-NEXT: v_lshl_or_b32 v105, v36, 16, v35 +; ALIGNED-NEXT: v_lshl_or_b32 v35, v114, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v103, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v92, v36, 16, v35 +; ALIGNED-NEXT: s_clause 0x1f +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:2016 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:2017 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:2018 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:2019 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:2020 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:2021 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:2022 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2023 
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2028 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:2029 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:2030 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:2031 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:2027 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:2024 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:2025 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:2026 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:2032 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:2033 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2034 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:2035 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:2036 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:2037 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2038 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:2039 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:2044 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:2045 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2046 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:2047 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:2043 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:2040 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2041 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:2042 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v99, 8, v101 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1792 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1794 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 
offen offset:1796 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:1797 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:1798 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:1799 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v87, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v97, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v75, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v81, 8, v84 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v80, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v70, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v40, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v69, 8, v71 +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v55, 8, v67 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v49, 8, v54 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v68, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v102, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v38, 8, v48 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v34, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v85, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v35, 8, v39 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v32, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v64, v50, 16, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: v_lshl_or_b32 v36, v29, 8, v31 +; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: v_lshl_or_b32 v50, v28, 8, v30 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v53, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v24, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v25, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v51, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v13, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v22, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v18, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v7, 8, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v50, 16, v36 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1793 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1795 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v36, v36, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v50, 8, v121 +; 
ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v111, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v123, 8, v120 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:1806 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1805 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1804 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen 
offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 
-; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; 
ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 -; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v66, 8, v39 -; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, 
v104 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:1807 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1803 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v36, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v50, v126, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 
offen offset:1800 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:1801 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:1802 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v36, v120, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v50, v121, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 16, v36 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 -; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte 
v[2:3], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 -; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 -; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 -; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1810 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:1808 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1809 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte v[0:1], v4 offset:250 +; ALIGNED-NEXT: flat_store_byte v[0:1], v7 offset:251 +; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:249 +; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:255 +; ALIGNED-NEXT: flat_store_byte v[0:1], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[0:1], v10 offset:254 +; ALIGNED-NEXT: flat_store_byte v[0:1], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:248 +; ALIGNED-NEXT: flat_store_byte v[0:1], v15 offset:242 +; ALIGNED-NEXT: flat_store_byte v[0:1], v14 offset:243 +; ALIGNED-NEXT: flat_store_byte v[0:1], v19 offset:241 +; ALIGNED-NEXT: flat_store_byte v[0:1], v13 offset:247 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v17 offset:245 +; ALIGNED-NEXT: flat_store_byte v[0:1], v16 offset:246 +; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:244 +; ALIGNED-NEXT: flat_store_byte v[0:1], v21 offset:240 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 
offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[0:1], v23 offset:234 +; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:235 +; ALIGNED-NEXT: flat_store_byte v[0:1], v24 offset:233 +; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:239 +; ALIGNED-NEXT: flat_store_byte v[0:1], v29 offset:237 +; ALIGNED-NEXT: flat_store_byte v[0:1], v30 offset:238 +; ALIGNED-NEXT: flat_store_byte v[0:1], v31 offset:236 +; ALIGNED-NEXT: flat_store_byte v[0:1], v26 offset:232 +; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[0:1], v38 offset:225 +; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:231 +; ALIGNED-NEXT: flat_store_byte v[0:1], v35 offset:229 +; ALIGNED-NEXT: flat_store_byte v[0:1], v37 offset:230 +; ALIGNED-NEXT: flat_store_byte v[0:1], v39 offset:228 +; ALIGNED-NEXT: flat_store_byte v[0:1], v48 offset:224 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: flat_store_byte v[0:1], v49 offset:209 +; ALIGNED-NEXT: flat_store_byte v[0:1], v68 offset:211 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v52 offset:210 +; ALIGNED-NEXT: flat_store_byte v[0:1], v81 offset:213 +; ALIGNED-NEXT: flat_store_byte v[0:1], v80 offset:215 +; ALIGNED-NEXT: flat_store_byte v[0:1], v82 offset:214 +; ALIGNED-NEXT: flat_store_byte v[0:1], v84 offset:212 +; ALIGNED-NEXT: flat_store_byte v[0:1], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[0:1], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[0:1], v70 offset:217 +; ALIGNED-NEXT: flat_store_byte v[0:1], v55 offset:223 +; ALIGNED-NEXT: flat_store_byte v[0:1], v69 offset:221 +; ALIGNED-NEXT: flat_store_byte v[0:1], v67 offset:222 +; ALIGNED-NEXT: flat_store_byte v[0:1], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[0:1], v54 offset:208 +; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, 
v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v105, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[0:1], v86 offset:202 +; ALIGNED-NEXT: flat_store_byte v[0:1], v97 offset:203 +; ALIGNED-NEXT: flat_store_byte v[0:1], v87 offset:201 +; ALIGNED-NEXT: flat_store_byte v[0:1], v98 offset:207 +; ALIGNED-NEXT: flat_store_byte v[0:1], v99 offset:205 +; ALIGNED-NEXT: flat_store_byte v[0:1], v100 offset:206 +; ALIGNED-NEXT: flat_store_byte v[0:1], v101 offset:204 +; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:200 +; ALIGNED-NEXT: flat_store_byte v[0:1], v112 offset:194 +; ALIGNED-NEXT: flat_store_byte v[0:1], v113 offset:195 +; ALIGNED-NEXT: flat_store_byte v[0:1], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[0:1], v103 offset:199 +; ALIGNED-NEXT: flat_store_byte v[0:1], v114 offset:197 +; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:198 +; ALIGNED-NEXT: flat_store_byte v[0:1], v118 offset:196 +; ALIGNED-NEXT: flat_store_byte v[0:1], v119 offset:192 +; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v125, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v125, v50, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v3, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte v[0:1], v41 offset:186 +; ALIGNED-NEXT: flat_store_byte v[0:1], v43 offset:187 +; ALIGNED-NEXT: flat_store_byte v[0:1], v42 offset:185 +; ALIGNED-NEXT: flat_store_byte v[0:1], v45 offset:191 +; ALIGNED-NEXT: flat_store_byte v[0:1], v46 offset:189 +; ALIGNED-NEXT: flat_store_byte v[0:1], v47 offset:190 +; ALIGNED-NEXT: flat_store_byte v[0:1], v56 offset:188 +; ALIGNED-NEXT: flat_store_byte v[0:1], v44 offset:184 +; ALIGNED-NEXT: flat_store_byte v[0:1], v60 offset:178 +; ALIGNED-NEXT: flat_store_byte v[0:1], v59 offset:179 +; ALIGNED-NEXT: flat_store_byte v[0:1], v63 offset:177 +; ALIGNED-NEXT: flat_store_byte v[0:1], v58 offset:183 +; ALIGNED-NEXT: flat_store_byte v[0:1], v62 offset:181 +; ALIGNED-NEXT: flat_store_byte v[0:1], v61 offset:182 +; ALIGNED-NEXT: flat_store_byte v[0:1], v72 offset:180 +; ALIGNED-NEXT: flat_store_byte v[0:1], v73 offset:176 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v74 offset:170 +; ALIGNED-NEXT: flat_store_byte v[0:1], v77 offset:171 +; ALIGNED-NEXT: flat_store_byte v[0:1], v76 offset:169 +; ALIGNED-NEXT: flat_store_byte v[0:1], v79 offset:175 +; ALIGNED-NEXT: flat_store_byte v[0:1], v90 offset:173 +; ALIGNED-NEXT: flat_store_byte v[0:1], v89 offset:174 +; ALIGNED-NEXT: flat_store_byte v[0:1], v91 offset:172 +; ALIGNED-NEXT: flat_store_byte v[0:1], v78 offset:168 +; ALIGNED-NEXT: flat_store_byte v[0:1], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[0:1], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[0:1], v108 offset:161 +; ALIGNED-NEXT: flat_store_byte v[0:1], v93 offset:167 +; ALIGNED-NEXT: flat_store_byte v[0:1], v107 offset:165 +; ALIGNED-NEXT: flat_store_byte v[0:1], v106 offset:166 +; ALIGNED-NEXT: flat_store_byte v[0:1], v109 offset:164 +; ALIGNED-NEXT: flat_store_byte v[0:1], v110 offset:160 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v122 offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:155 +; ALIGNED-NEXT: flat_store_byte v[0:1], v124 offset:153 +; ALIGNED-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:159 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:157 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:158 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:156 +; ALIGNED-NEXT: flat_store_byte v[0:1], v127 offset:152 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:146 +; ALIGNED-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:147 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:145 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:151 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:149 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:150 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:148 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1324 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:144 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:138 +; 
ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:139 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:137 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:143 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:141 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:142 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:140 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:140 +; ALIGNED-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:136 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:130 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:131 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:129 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:135 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:133 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:134 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:132 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:128 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 +; ALIGNED-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:122 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:123 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:121 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:127 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:125 +; ALIGNED-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:126 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:124 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:120 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:114 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:115 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:113 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1192 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:119 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:117 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:118 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:116 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:112 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:632 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1144 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:636 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:106 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:107 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:105 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:105 +; ALIGNED-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:111 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:109 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:110 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:108 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:104 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:98 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1092 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:99 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:97 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:103 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:101 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:102 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:100 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) 
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:96 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:90 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:91 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:89 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:95 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:93 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:94 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:92 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:88 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:82 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:83 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:81 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:87 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:85 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:86 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:86 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:84 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:80 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:74 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:75 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:73 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:79 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:77 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:78 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:76 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:72 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:66 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:67 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:65 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:71 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:69 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:70 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:70 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:68 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:64 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:680 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 -; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:684 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:672 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:61 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:58 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:59 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:57 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:63 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:62 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:60 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:56 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:53 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:856 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:50 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:51 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:49 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:55 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:54 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:52 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:52 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:48 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:700 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:696 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:692 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:688 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:43 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:42 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:41 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:40 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:47 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:46 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:45 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[0:1], v3 offset:44 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:35 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:34 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:33 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:32 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:39 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:39 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:38 +; ALIGNED-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:37 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:36 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:652 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 
4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:26 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:27 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:25 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:31 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:29 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:30 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], 
v3 offset:28 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:24 +; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:18 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:19 +; ALIGNED-NEXT: flat_store_byte v[0:1], v50 offset:17 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:23 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:21 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:22 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:20 +; ALIGNED-NEXT: flat_store_byte v[0:1], v94 offset:16 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: flat_store_byte v[0:1], v111 offset:10 +; 
ALIGNED-NEXT: flat_store_byte v[0:1], v121 offset:11 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:13 +; ALIGNED-NEXT: flat_store_byte v[0:1], v120 offset:9 +; ALIGNED-NEXT: flat_store_byte v[0:1], v126 offset:15 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:14 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:12 +; ALIGNED-NEXT: flat_store_byte v[0:1], v123 offset:8 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:2 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:3 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:1 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:7 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:5 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:6 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 offset:4 +; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 -; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 -; ALIGNED-NEXT: .LBB9_5: ; %Flow11 -; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; ALIGNED-NEXT: 
s_cbranch_scc0 .LBB9_5 +; ALIGNED-NEXT: .LBB9_6: ; %Flow17 +; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 @@ -15844,42 +15892,44 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3: ; %bb.0: ; %entry ; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNROLL3-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0 -; UNROLL3-NEXT: s_mov_b32 s6, exec_lo +; UNROLL3-NEXT: s_mov_b32 s4, exec_lo ; UNROLL3-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo ; UNROLL3-NEXT: v_cmpx_ge_u32_e64 v2, v3 -; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s6 +; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4 ; UNROLL3-NEXT: s_cbranch_execz .LBB9_4 ; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader -; UNROLL3-NEXT: v_mov_b32_e32 v3, v2 +; UNROLL3-NEXT: v_mov_b32_e32 v4, v1 +; UNROLL3-NEXT: v_mov_b32_e32 v3, v0 +; UNROLL3-NEXT: v_mov_b32_e32 v5, v2 +; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28 -; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v14, v3, 
s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44 -; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo -; UNROLL3-NEXT: s_add_u32 s4, s4, 48 -; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3 -; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 +; UNROLL3-NEXT: buffer_load_dword v6, v5, s[0:3], 0 offen +; UNROLL3-NEXT: buffer_load_dword v7, v5, s[0:3], 0 offen offset:4 +; UNROLL3-NEXT: buffer_load_dword v8, v5, s[0:3], 0 offen offset:8 +; UNROLL3-NEXT: buffer_load_dword v9, v5, s[0:3], 0 offen offset:12 +; UNROLL3-NEXT: buffer_load_dword v10, v5, s[0:3], 0 offen offset:16 +; UNROLL3-NEXT: buffer_load_dword v11, v5, s[0:3], 0 offen offset:20 +; UNROLL3-NEXT: buffer_load_dword v12, v5, s[0:3], 0 offen offset:24 +; UNROLL3-NEXT: buffer_load_dword v13, v5, s[0:3], 0 offen offset:28 +; UNROLL3-NEXT: buffer_load_dword v14, v5, s[0:3], 0 offen offset:32 +; UNROLL3-NEXT: buffer_load_dword v15, v5, s[0:3], 0 offen offset:36 +; UNROLL3-NEXT: buffer_load_dword v16, v5, s[0:3], 0 offen offset:40 +; UNROLL3-NEXT: buffer_load_dword v17, v5, s[0:3], 0 offen offset:44 +; UNROLL3-NEXT: v_add_nc_u32_e32 v5, 48, v5 +; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 +; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16 -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[10:13] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[6:9] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32 -; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[14:17] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, v3, 48 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo +; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc1 .LBB9_2 ; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual ; 
UNROLL3-NEXT: s_inst_prefetch 0x2 @@ -15898,9 +15948,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: ; implicit-def: $vgpr2 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 -; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; UNROLL3-NEXT: .LBB9_4: ; %Flow8 -; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6 +; UNROLL3-NEXT: ; implicit-def: $vgpr0 +; UNROLL3-NEXT: .LBB9_4: ; %Flow14 +; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6 ; UNROLL3-NEXT: s_cbranch_execz .LBB9_7 ; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual ; UNROLL3-NEXT: s_clause 0x3 @@ -15908,51 +15958,51 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036 ; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040 ; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044 -; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0 -; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0 -; UNROLL3-NEXT: s_mov_b32 s7, -1 +; UNROLL3-NEXT: s_movk_i32 s4, 0xf820 +; UNROLL3-NEXT: s_mov_b32 s5, -1 ; UNROLL3-NEXT: s_waitcnt vmcnt(0) ; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032 ; UNROLL3-NEXT: s_clause 0x3 -; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016 -; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020 -; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024 -; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028 -; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v2 +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2016 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2020 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:2024 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:2028 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0x7b0, v0 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo 
; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016 +; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[5:8] offset:2016 ; UNROLL3-NEXT: s_inst_prefetch 0x1 ; UNROLL3-NEXT: .p2align 6 ; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop ; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1 ; UNROLL3-NEXT: s_clause 0xb -; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen -; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 -; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 -; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 -; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 -; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 -; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 -; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32 -; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36 -; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40 -; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44 -; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4 -; UNROLL3-NEXT: v_add_co_ci_u32_e64 v16, null, s5, v1, vcc_lo +; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1968 +; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1972 +; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1976 +; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1980 +; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1984 +; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1988 +; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1992 +; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:1996 +; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:2000 +; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:2004 
+; UNROLL3-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:2008 +; UNROLL3-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:2012 ; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2 -; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0 -; UNROLL3-NEXT: s_addc_u32 s5, s5, -1 +; UNROLL3-NEXT: s_add_u32 s4, s4, 48 +; UNROLL3-NEXT: s_addc_u32 s5, s5, 0 ; UNROLL3-NEXT: s_waitcnt vmcnt(4) -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16 -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[9:12] offset:16 +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[5:8] ; UNROLL3-NEXT: s_waitcnt vmcnt(0) -; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[11:14] offset:32 -; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[13:16] offset:32 +; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0xffffffd0, v3 +; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v4, vcc_lo +; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0 ; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_6 -; UNROLL3-NEXT: .LBB9_7: ; %Flow9 +; UNROLL3-NEXT: .LBB9_7: ; %Flow15 ; UNROLL3-NEXT: s_inst_prefetch 0x2 -; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; UNROLL3-NEXT: s_waitcnt lgkmcnt(0) ; UNROLL3-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll index 61c1fd6fbb198..0a7102a14a83a 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll @@ -12,22 +12,20 @@ define amdgpu_kernel void @scaledregtest() local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: loopexit: -; CHECK-NEXT: [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], 
[[FOR_BODY]] ] ; CHECK-NEXT: br label [[FOR_BODY_1:%.*]] ; CHECK: for.body.1: -; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ] +; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA:%.*]], [[LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA:%.*]], [[LOOPEXIT]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8 ; CHECK-NEXT: store ptr [[TMP0]], ptr [[LSR_IV1]], align 8 ; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8 ; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8 ; CHECK-NEXT: br label [[FOR_BODY_1]] ; CHECK: for.body: -; CHECK-NEXT: [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ] -; CHECK-NEXT: [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64 -; CHECK-NEXT: [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64 +; CHECK-NEXT: [[SCEVGEP11_LCSSA]] = phi ptr addrspace(5) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ inttoptr (i32 64 to ptr addrspace(5)), [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCEVGEP13_LCSSA]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ inttoptr (i64 64 to ptr), [[ENTRY]] ] +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[SCEVGEP13_LCSSA]], i64 64 +; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr addrspace(5) [[SCEVGEP11_LCSSA]], i32 64 ; CHECK-NEXT: br i1 false, label [[LOOPEXIT]], label [[FOR_BODY]] ; entry: @@ -58,7 +56,7 @@ for.body: define protected amdgpu_kernel void @baseregtest(i32 %n, i32 %lda, i1 %arg) local_unnamed_addr { ; CHECK-LABEL: @baseregtest( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[EXIT:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @foo() ; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 3