From c83adad3980bf9fb80d1410dee98d71c7606de4f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 6 Nov 2025 12:42:04 +0000
Subject: [PATCH] [VPlan] Merge `fcmp uno` feeding AnyOf.

Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
     any-of (fcmp uno %A, %B), ...

This pattern is generated to check if any vector lane is NaN, and
combining multiple compares is beneficial on architectures that have
dedicated instructions.

Alive2 Proof:
https://alive2.llvm.org/ce/z/vA_aoM
---
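Notes (not part of the commit message): a minimal hand-written IR sketch
of the fold, for illustration only. The function names are made up, and
the freeze seen in the updated tests below is omitted for brevity. Both
functions return true iff any lane of %a or %b is NaN, because fcmp uno
is true whenever either operand is NaN:

  declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

  ; Before the fold: one self-compare per value, or'd together.
  define i1 @any_nan_before(<4 x float> %a, <4 x float> %b) {
    %c.a = fcmp uno <4 x float> %a, %a
    %c.b = fcmp uno <4 x float> %b, %b
    %any = or <4 x i1> %c.a, %c.b
    %r = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %any)
    ret i1 %r
  }

  ; After the fold: a single pairwise unordered compare.
  define i1 @any_nan_after(<4 x float> %a, <4 x float> %b) {
    %c = fcmp uno <4 x float> %a, %b
    %r = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %c)
    ret i1 %r
  }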
 .../Transforms/Vectorize/VPlanPatternMatch.h  |  4 ++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 23 +++++++++++++++++++
 .../AArch64/fmax-without-fast-math-flags.ll   |  7 ++----
 .../AArch64/fmin-without-fast-math-flags.ll   |  7 ++----
 ...fmax-without-fast-math-flags-interleave.ll |  7 ++----
 5 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b57c44872c1b6..8b2931637113f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
 }
 
+inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
+  return m_VPInstruction<VPInstruction::AnyOf>();
+}
+
 template <typename Op0_t>
 inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
 m_AnyOf(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 634df51a12965..0c04cd6174a7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1221,6 +1221,29 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
+  // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
+  // any-of (fcmp uno %A, %B), ...
+  if (match(Def, m_AnyOf()) && Def->getNumOperands() % 2 == 0) {
+    SmallVector<VPValue *> NewOps;
+    unsigned NumOps = Def->getNumOperands();
+    for (unsigned I = 0; I < NumOps; I += 2) {
+      VPValue *A, *B;
+      if (!match(
+              Def->getOperand(I),
+              m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(A), m_Deferred(A))) ||
+          !match(Def->getOperand(I + 1),
+                 m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(B), m_Deferred(B))))
+        break;
+
+      NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO, A, B));
+    }
+
+    if (NewOps.size() == NumOps / 2) {
+      VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
+      return Def->replaceAllUsesWith(NewAnyOf);
+    }
+  }
+
   // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
   if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
        match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 7e58d9d6a8ec9..b65a7e999a780 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index 1cc4c152649b4..193424d3eb70a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -59,11 +59,8 @@ define float @fminnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index 01fab87209a35..ebd4ab54e9b74 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]