Skip to content

Commit 80e2792

Browse files
committed
Merge commit '7fe60a768581316e229633d014f52d390546def9' into llvmspirv_pulldown
2 parents b761dc4 + 7fe60a7 commit 80e2792

35 files changed

+459
-574
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -93,22 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
9393
}
9494

9595

96-
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
97-
def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
98-
def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
99-
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
100-
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
101-
def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
102-
def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
103-
def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
104-
105-
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
106-
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
107-
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
108-
def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
109-
def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
110-
}
111-
11296
let Features = "sse3" in {
11397
foreach Op = ["addsub"] in {
11498
def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
@@ -219,15 +203,6 @@ let Features = "sse2", Attributes = [NoThrow] in {
219203
def movnti : X86Builtin<"void(int *, int)">;
220204
}
221205

222-
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
223-
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
224-
def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
225-
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
226-
def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
227-
def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
228-
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
229-
}
230-
231206
let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
232207
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
233208
def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
@@ -285,12 +260,27 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
285260
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
286261
}
287262

288-
let Features = "sse2",
289-
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
263+
let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
264+
def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
265+
def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
266+
267+
def pavgb128 : X86Builtin<"_Vector<16, unsigned char>(_Vector<16, unsigned char>, _Vector<16, unsigned char>)">;
268+
def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
269+
290270
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
291-
271+
def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
272+
def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
292273
def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
293274

275+
def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
276+
def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
277+
def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">;
278+
279+
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
280+
def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
281+
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
282+
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
283+
294284
def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
295285
def pslldi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
296286
def psllqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">;
@@ -304,6 +294,12 @@ let Features = "sse2",
304294

305295
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
306296
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
297+
298+
def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">;
299+
def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">;
300+
def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
301+
def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
302+
def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
307303
}
308304

309305
let Features = "sse3", Attributes = [NoThrow] in {

flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ namespace {
4141
static bool isAssumedSize(mlir::ValueRange shape) {
4242
if (shape.size() != 1)
4343
return false;
44-
std::optional<std::int64_t> val = fir::getIntIfConstant(shape[0]);
45-
if (val && *val == -1)
44+
if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(shape[0].getDefiningOp()))
4645
return true;
4746
return false;
4847
}

flang/test/Fir/CUDA/cuda-shared-offset.mlir

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
44
gpu.module @cuda_device_mod {
55
gpu.func @_QPdynshared() kernel {
6-
%c-1 = arith.constant -1 : index
7-
%6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
8-
%7 = fir.shape %c-1 : (index) -> !fir.shape<1>
6+
%0 = fir.assumed_size_extent : index
7+
%6 = cuf.shared_memory !fir.array<?xf32>, %0 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
8+
%7 = fir.shape %0 : (index) -> !fir.shape<1>
99
%8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
1010
gpu.return
1111
}
@@ -14,7 +14,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
1414

1515
// CHECK-LABEL: gpu.module @cuda_device_mod
1616
// CHECK: gpu.func @_QPdynshared()
17-
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
17+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
1818
// CHECK: gpu.return
1919
// CHECK: }
2020
// CHECK: fir.global external @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
@@ -127,16 +127,16 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
127127
gpu.module @cuda_device_mod {
128128
gpu.func @_QMmtestsPtestany(%arg0: !fir.ref<!fir.array<?xf32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
129129
%0 = fir.dummy_scope : !fir.dscope
130-
%c-1 = arith.constant -1 : index
131-
%1 = fir.shape %c-1 : (index) -> !fir.shape<1>
130+
%a0 = fir.assumed_size_extent : index
131+
%1 = fir.shape %a0 : (index) -> !fir.shape<1>
132132
%2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QMmtestsFtestanyEa"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
133133
%3 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
134134
%4:2 = hlfir.declare %3 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
135135
%5 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
136136
%6:2 = hlfir.declare %5 {uniq_name = "_QM__fortran_builtinsE__builtin_blockidx"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
137-
%c-1_0 = arith.constant -1 : index
138-
%7 = cuf.shared_memory !fir.array<?xf64>, %c-1_0 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
139-
%8 = fir.shape %c-1_0 : (index) -> !fir.shape<1>
137+
%a2 = fir.assumed_size_extent : index
138+
%7 = cuf.shared_memory !fir.array<?xf64>, %a2 : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
139+
%8 = fir.shape %a2 : (index) -> !fir.shape<1>
140140
%9:2 = hlfir.declare %7(%8) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEdmasks"} : (!fir.ref<!fir.array<?xf64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
141141
%10 = fir.address_of(@_QM__fortran_builtinsE__builtin_griddim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
142142
%11:2 = hlfir.declare %10 {uniq_name = "_QM__fortran_builtinsE__builtin_griddim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
@@ -146,17 +146,17 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
146146
%15:2 = hlfir.declare %14 {uniq_name = "_QMmtestsFtestanyEiam"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
147147
%16 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmtestsFtestanyEj"}
148148
%17:2 = hlfir.declare %16 {uniq_name = "_QMmtestsFtestanyEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
149-
%c-1_1 = arith.constant -1 : index
150-
%18 = cuf.shared_memory !fir.array<?xf32>, %c-1_1 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
151-
%19 = fir.shape %c-1_1 : (index) -> !fir.shape<1>
149+
%a3 = fir.assumed_size_extent : index
150+
%18 = cuf.shared_memory !fir.array<?xf32>, %a3 : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
151+
%19 = fir.shape %a3 : (index) -> !fir.shape<1>
152152
%20:2 = hlfir.declare %18(%19) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmtestsFtestanyEsmasks"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
153153
gpu.return
154154
}
155155
}
156156
}
157157

158158
// CHECK-LABEL: gpu.func @_QMmtestsPtestany
159-
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %c-1{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
160-
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
159+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf64>, %{{.*}} : index {bindc_name = "dmasks", uniq_name = "_QMmtestsFtestanyEdmasks"} -> !fir.ref<!fir.array<?xf64>>
160+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
161161

162162
// CHECK: fir.global external @_QMmtestsPtestany__shared_mem {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>

llvm/docs/SandboxIR.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ Within your LLVM pass:
88

99
``` C++
1010
// 1. Include the necessary Sandbox IR header files.
11-
#include "llvm/SandboxIR/Context.h
12-
#include "llvm/SandboxIR/Function.h
11+
#include "llvm/SandboxIR/Context.h"
12+
#include "llvm/SandboxIR/Function.h"
1313

1414
// 2. Create a sandboxir::Context using LLVMContext `LLVMCtx`.
1515
sandboxir::Context Ctx(LLVMCtx);

llvm/include/llvm/CodeGen/MachineBasicBlock.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class MachineBasicBlock
129129
MCRegister PhysReg;
130130
LaneBitmask LaneMask;
131131

132-
RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask)
132+
RegisterMaskPair(MCRegister PhysReg, LaneBitmask LaneMask)
133133
: PhysReg(PhysReg), LaneMask(LaneMask) {}
134134

135135
bool operator==(const RegisterMaskPair &other) const {

llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp

Lines changed: 73 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,24 +1123,85 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
11231123
}
11241124
}
11251125

1126-
// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
1127-
// where a consecutive multi-vector tuple is constructed from the same indices
1128-
// of multiple strided loads. This may still result in unnecessary copies
1129-
// between the loads and the tuple. Here we try to return a hint to assign the
1130-
// contiguous ZPRMulReg starting at the same register as the first operand of
1131-
// the pseudo, which should be a subregister of the first strided load.
1126+
// We add regalloc hints for different cases:
1127+
// * Choosing a better destination operand for predicated SVE instructions
1128+
where the inactive lanes are undef, by choosing a register that is also
1129+
used by one of the other operands of the instruction.
11321130
//
1133-
// For example, if the first strided load has been assigned $z16_z20_z24_z28
1134-
// and the operands of the pseudo are each accessing subregister zsub2, we
1135-
// should look through through Order to find a contiguous register which
1136-
// begins with $z24 (i.e. $z24_z25_z26_z27).
1131+
// * Improve register allocation for SME multi-vector instructions where we can
1132+
// benefit from the strided- and contiguous register multi-vector tuples.
11371133
//
1134+
// Here FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register
1135+
// allocation where a consecutive multi-vector tuple is constructed from the
1136+
// same indices of multiple strided loads. This may still result in
1137+
// unnecessary copies between the loads and the tuple. Here we try to return a
1138+
// hint to assign the contiguous ZPRMulReg starting at the same register as
1139+
// the first operand of the pseudo, which should be a subregister of the first
1140+
// strided load.
1141+
//
1142+
// For example, if the first strided load has been assigned $z16_z20_z24_z28
1143+
// and the operands of the pseudo are each accessing subregister zsub2, we
1144+
should look through Order to find a contiguous register which
1145+
// begins with $z24 (i.e. $z24_z25_z26_z27).
11381146
bool AArch64RegisterInfo::getRegAllocationHints(
11391147
Register VirtReg, ArrayRef<MCPhysReg> Order,
11401148
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
11411149
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
1142-
11431150
auto &ST = MF.getSubtarget<AArch64Subtarget>();
1151+
const AArch64InstrInfo *TII =
1152+
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
1153+
const MachineRegisterInfo &MRI = MF.getRegInfo();
1154+
1155+
// For predicated SVE instructions where the inactive lanes are undef,
1156+
// pick a destination register that is not unique to avoid introducing
1157+
// a movprfx.
1158+
const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
1159+
if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
1160+
for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
1161+
const MachineInstr &Def = *DefOp.getParent();
1162+
if (DefOp.isImplicit() ||
1163+
(TII->get(Def.getOpcode()).TSFlags & AArch64::FalseLanesMask) !=
1164+
AArch64::FalseLanesUndef)
1165+
continue;
1166+
1167+
unsigned InstFlags =
1168+
TII->get(AArch64::getSVEPseudoMap(Def.getOpcode())).TSFlags;
1169+
1170+
for (MCPhysReg R : Order) {
1171+
auto AddHintIfSuitable = [&](MCPhysReg R, const MachineOperand &MO) {
1172+
// R is a suitable register hint if there exists an operand for the
1173+
// instruction that is not yet allocated a register or if R matches
1174+
// one of the other source operands.
1175+
if (!VRM->hasPhys(MO.getReg()) || VRM->getPhys(MO.getReg()) == R)
1176+
Hints.push_back(R);
1177+
};
1178+
1179+
switch (InstFlags & AArch64::DestructiveInstTypeMask) {
1180+
default:
1181+
break;
1182+
case AArch64::DestructiveTernaryCommWithRev:
1183+
AddHintIfSuitable(R, Def.getOperand(2));
1184+
AddHintIfSuitable(R, Def.getOperand(3));
1185+
AddHintIfSuitable(R, Def.getOperand(4));
1186+
break;
1187+
case AArch64::DestructiveBinaryComm:
1188+
case AArch64::DestructiveBinaryCommWithRev:
1189+
AddHintIfSuitable(R, Def.getOperand(2));
1190+
AddHintIfSuitable(R, Def.getOperand(3));
1191+
break;
1192+
case AArch64::DestructiveBinary:
1193+
case AArch64::DestructiveBinaryImm:
1194+
AddHintIfSuitable(R, Def.getOperand(2));
1195+
break;
1196+
}
1197+
}
1198+
}
1199+
1200+
if (Hints.size())
1201+
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
1202+
MF, VRM);
1203+
}
1204+
11441205
if (!ST.hasSME() || !ST.isStreaming())
11451206
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
11461207
VRM);
@@ -1153,8 +1214,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
11531214
// FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
11541215
// instructions over reducing the number of clobbered callee-save registers,
11551216
// so we add the strided registers as a hint.
1156-
const MachineRegisterInfo &MRI = MF.getRegInfo();
1157-
unsigned RegID = MRI.getRegClass(VirtReg)->getID();
1217+
unsigned RegID = RegRC->getID();
11581218
if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
11591219
RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
11601220

llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,11 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c,
5252
; CHECK-NEXT: ptrue p0.d, vl2
5353
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
5454
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
55-
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
5655
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
56+
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
5757
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
58-
; CHECK-NEXT: movprfx z1, z2
59-
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d
60-
; CHECK-NEXT: sub v0.2d, v1.2d, v0.2d
58+
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z3.d
59+
; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d
6160
; CHECK-NEXT: ret
6261
%div = sdiv <2 x i64> %a, %b
6362
%mul = mul <2 x i64> %c, %d

0 commit comments

Comments
 (0)