[VPlan] Merge fcmp uno feeding AnyOf.
#166823
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> any-of (fcmp uno %A, %B), ...

This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions.

Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM

Combine suggested as part of #161735

Full diff: https://github.com/llvm/llvm-project/pull/166823.diff

5 Files Affected:
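For illustration, a minimal IR-level sketch of the fold on two NaN checks, in the spirit of the Alive2 proof; the function and value names are illustrative and not taken from this patch:

```llvm
declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

; Before the fold: one self-compare per part, OR-ed together and reduced
; to a single "any lane is NaN" bit.
define i1 @any_nan_before(<4 x float> %a, <4 x float> %b) {
  %nan.a = fcmp uno <4 x float> %a, %a
  %nan.b = fcmp uno <4 x float> %b, %b
  %or = or <4 x i1> %nan.a, %nan.b
  %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %or)
  ret i1 %any
}

; After the fold: fcmp uno %a, %b is true in a lane exactly when %a or %b
; is NaN in that lane, so a single compare replaces the pair.
define i1 @any_nan_after(<4 x float> %a, <4 x float> %b) {
  %nan.ab = fcmp uno <4 x float> %a, %b
  %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %nan.ab)
  ret i1 %any
}
```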
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b57c44872c1b6..8b2931637113f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
}
+inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
+ return m_VPInstruction<VPInstruction::AnyOf>();
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
m_AnyOf(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697397f41..937c420dcd6ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1221,6 +1221,29 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}
}
+ // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
+ // any-of (fcmp uno %A, %B), ...
+ if (match(Def, m_AnyOf()) && Def->getNumOperands() % 2 == 0) {
+ SmallVector<VPValue *, 4> NewOps;
+ unsigned NumOps = Def->getNumOperands();
+ for (unsigned I = 0; I < NumOps; I += 2) {
+ VPValue *A, *B;
+ if (!match(
+ Def->getOperand(I),
+ m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(A), m_Deferred(A))) ||
+ !match(Def->getOperand(I + 1),
+ m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(B), m_Deferred(B))))
+ break;
+
+ NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO, A, B));
+ }
+
+ if (NewOps.size() == NumOps / 2) {
+ VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
+ return Def->replaceAllUsesWith(NewAnyOf);
+ }
+ }
+
// Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 3c83c01929aae..f4e5085c604f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP18]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index 711a9cd03ac15..89b5d1395a4a0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -60,12 +60,9 @@ define float @fminnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index af648df9fc5c7..eab3492ebe2fa 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
Fold
any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
any-of (fcmp uno %A, %B), ...
This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions.
Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM
Combine suggested as part of #161735
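Because the new code in VPlanTransforms.cpp walks the AnyOf operands pairwise, the fold also applies when more than two self-compares feed the any-of. A hedged sketch of the four-part shape one would expect after the transform (hypothetical names, e.g. for a higher interleave count), where four self-compares collapse into two merged compares:

```llvm
declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

; any-of (fcmp uno %a, %a), (fcmp uno %b, %b), (fcmp uno %c, %c), (fcmp uno %d, %d)
; is rewritten pairwise into any-of (fcmp uno %a, %b), (fcmp uno %c, %d).
define i1 @any_nan_four_parts(<4 x float> %a, <4 x float> %b,
                              <4 x float> %c, <4 x float> %d) {
  %nan.ab = fcmp uno <4 x float> %a, %b
  %nan.cd = fcmp uno <4 x float> %c, %d
  %or = or <4 x i1> %nan.ab, %nan.cd
  %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %or)
  ret i1 %any
}
```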