Skip to content

Commit c707451

Browse files
committed
[VPlan] Merge fcmp uno feeding AnyOf.
Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> any-of (fcmp uno %A, %B), ... The self-compare pattern (fcmp uno %X, %X) is generated to check whether any vector lane is NaN. Because fcmp uno %A, %B is true whenever either operand is NaN, a single compare can cover two inputs, and combining multiple compares this way is beneficial on architectures that have dedicated instructions. Alive2 proof: https://alive2.llvm.org/ce/z/vA_aoM
1 parent 3c31cde commit c707451

File tree

5 files changed

+36
-18
lines changed

5 files changed

+36
-18
lines changed

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
417417
return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
418418
}
419419

420+
/// Builds a matcher for VPInstruction::AnyOf that takes no operand
/// sub-patterns. NOTE(review): presumably this matches an AnyOf regardless of
/// its operand count (the fold added in this commit checks getNumOperands()
/// itself) -- confirm against VPInstruction_match.
inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
421+
return m_VPInstruction<VPInstruction::AnyOf>();
422+
}
423+
420424
template <typename Op0_t>
421425
inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
422426
m_AnyOf(const Op0_t &Op0) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,29 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12211221
}
12221222
}
12231223

1224+
// Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1225+
// any-of (fcmp uno %A, %B), ...
1226+
if (match(Def, m_AnyOf()) && Def->getNumOperands() % 2 == 0) {
1227+
SmallVector<VPValue *, 4> NewOps;
1228+
unsigned NumOps = Def->getNumOperands();
1229+
for (unsigned I = 0; I < NumOps; I += 2) {
1230+
VPValue *A, *B;
1231+
if (!match(
1232+
Def->getOperand(I),
1233+
m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(A), m_Deferred(A))) ||
1234+
!match(Def->getOperand(I + 1),
1235+
m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(B), m_Deferred(B))))
1236+
break;
1237+
1238+
NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO, A, B));
1239+
}
1240+
1241+
if (NewOps.size() == NumOps / 2) {
1242+
VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1243+
return Def->replaceAllUsesWith(NewAnyOf);
1244+
}
1245+
}
1246+
12241247
// Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
12251248
if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
12261249
match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&

llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
6262
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
63-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
64-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
65-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
66-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
67-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
68-
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
63+
; CHECK-NEXT: [[TMP18:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
64+
; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP18]]
65+
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
7067
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
7168
; CHECK: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,9 @@ define float @fminnum(ptr %src, i64 %n) {
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
6262
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
63-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
64-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
65-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
66-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
67-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
68-
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
63+
; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
64+
; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
65+
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
7067
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
7168
; CHECK: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
6262
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
63-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
64-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
65-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
66-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
67-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
68-
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
63+
; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
64+
; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
65+
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
7067
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
7168
; CHECK: [[MIDDLE_BLOCK]]:

0 commit comments

Comments
 (0)