Skip to content

Commit cbb9b0e

Browse files
authored
[AArch64] Lower v1i64 and v2i64 [S|U][MIN|MAX] to SVE when available (#166735)
The governing predicate (ptrue) is likely to be hoisted out of loops, so inside a loop this lowers to a single SVE min/max instruction instead of the NEON compare + bitwise-select pair, which should have lower latency.
1 parent e974c65 commit cbb9b0e

File tree

2 files changed

+122
-2
lines changed

2 files changed

+122
-2
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11330,9 +11330,10 @@ SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
1133011330
break;
1133111331
}
1133211332

11333+
// Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11334+
// prefer using SVE if available.
1133311335
if (VT.isScalableVector() ||
11334-
useSVEForFixedLengthVectorVT(
11335-
VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
11336+
useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
1133611337
switch (Opcode) {
1133711338
default:
1133811339
llvm_unreachable("Wrong instruction");
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
3+
; RUN: llc < %s -mtriple=aarch64 -mattr=+neon,+sve | FileCheck %s --check-prefix=CHECK-SVE
4+
5+
define <2 x i64> @smax_v2i64(<2 x i64> %a, <2 x i64> %b){
6+
; CHECK-LABEL: smax_v2i64:
7+
; CHECK: // %bb.0: // %entry
8+
; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d
9+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
10+
; CHECK-NEXT: ret
11+
;
12+
; CHECK-SVE-LABEL: smax_v2i64:
13+
; CHECK-SVE: // %bb.0: // %entry
14+
; CHECK-SVE-NEXT: ptrue p0.d, vl2
15+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
16+
; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
17+
; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
18+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
19+
; CHECK-SVE-NEXT: ret
20+
entry:
21+
%0 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b)
22+
ret <2 x i64> %0
23+
}
24+
25+
define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) {
26+
; CHECK-LABEL: smin_v2i64:
27+
; CHECK: // %bb.0: // %entry
28+
; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
29+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
30+
; CHECK-NEXT: ret
31+
;
32+
; CHECK-SVE-LABEL: smin_v2i64:
33+
; CHECK-SVE: // %bb.0: // %entry
34+
; CHECK-SVE-NEXT: ptrue p0.d, vl2
35+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
36+
; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
37+
; CHECK-SVE-NEXT: smin z0.d, p0/m, z0.d, z1.d
38+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
39+
; CHECK-SVE-NEXT: ret
40+
entry:
41+
%0 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b)
42+
ret <2 x i64> %0
43+
}
44+
45+
define <2 x i64> @umax_v2i64(<2 x i64> %a, <2 x i64> %b){
46+
; CHECK-LABEL: umax_v2i64:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: cmhi v2.2d, v0.2d, v1.2d
49+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
50+
; CHECK-NEXT: ret
51+
;
52+
; CHECK-SVE-LABEL: umax_v2i64:
53+
; CHECK-SVE: // %bb.0: // %entry
54+
; CHECK-SVE-NEXT: ptrue p0.d, vl2
55+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
56+
; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
57+
; CHECK-SVE-NEXT: umax z0.d, p0/m, z0.d, z1.d
58+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
59+
; CHECK-SVE-NEXT: ret
60+
entry:
61+
%0 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
62+
ret <2 x i64> %0
63+
}
64+
65+
define <2 x i64> @umin_v2i64(<2 x i64> %a, <2 x i64> %b) {
66+
; CHECK-LABEL: umin_v2i64:
67+
; CHECK: // %bb.0: // %entry
68+
; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d
69+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
70+
; CHECK-NEXT: ret
71+
;
72+
; CHECK-SVE-LABEL: umin_v2i64:
73+
; CHECK-SVE: // %bb.0: // %entry
74+
; CHECK-SVE-NEXT: ptrue p0.d, vl2
75+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
76+
; CHECK-SVE-NEXT: // kill: def $q1 killed $q1 def $z1
77+
; CHECK-SVE-NEXT: umin z0.d, p0/m, z0.d, z1.d
78+
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
79+
; CHECK-SVE-NEXT: ret
80+
entry:
81+
%0 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b)
82+
ret <2 x i64> %0
83+
}
84+
85+
define <1 x i64> @smax_v1i64(<1 x i64> %a, <1 x i64> %b){
86+
; CHECK-LABEL: smax_v1i64:
87+
; CHECK: // %bb.0: // %entry
88+
; CHECK-NEXT: cmgt d2, d0, d1
89+
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
90+
; CHECK-NEXT: ret
91+
;
92+
; CHECK-SVE-LABEL: smax_v1i64:
93+
; CHECK-SVE: // %bb.0: // %entry
94+
; CHECK-SVE-NEXT: ptrue p0.d, vl1
95+
; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 def $z0
96+
; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 def $z1
97+
; CHECK-SVE-NEXT: smax z0.d, p0/m, z0.d, z1.d
98+
; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
99+
; CHECK-SVE-NEXT: ret
100+
entry:
101+
%0 = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b)
102+
ret <1 x i64> %0
103+
}
104+
105+
; This is legal for Neon, so this should use the Neon smax.
106+
define <4 x i32> @smax_v4i32(<4 x i32> %a, <4 x i32> %b){
107+
; CHECK-LABEL: smax_v4i32:
108+
; CHECK: // %bb.0: // %entry
109+
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
110+
; CHECK-NEXT: ret
111+
;
112+
; CHECK-SVE-LABEL: smax_v4i32:
113+
; CHECK-SVE: // %bb.0: // %entry
114+
; CHECK-SVE-NEXT: smax v0.4s, v0.4s, v1.4s
115+
; CHECK-SVE-NEXT: ret
116+
entry:
117+
%0 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
118+
ret <4 x i32> %0
119+
}

0 commit comments

Comments
 (0)