From 93a6be03b51027e351458382ff8c05e65fbacbef Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Fri, 27 Jun 2025 15:32:18 +0100 Subject: [PATCH 1/4] Add test for inserting zero into vector lane of fmul result --- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 937a17ca6c1e0..27c5668b06d02 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1186,6 +1186,16 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { ret double %res } +define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: fmul_insert_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul.4s v0, v0, v1 +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: ret + %mul = fmul <4 x float> %A, %B + %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 + ret <4 x float> %mul_set_lane +} define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { From e08422d223607519aeb742e774b180b148930d02 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Fri, 27 Jun 2025 15:39:14 +0100 Subject: [PATCH 2/4] [ISel] Commute FMUL and inserting zero into vector lane --- .../Target/AArch64/AArch64ISelLowering.cpp | 32 +++++++++++++++++++ llvm/test/CodeGen/AArch64/arm64-vmul.ll | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c8b1eafd35495..867833f67b822 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26268,11 +26268,43 @@ static SDValue removeRedundantInsertVectorElt(SDNode *N) { return ExtractVec; } +static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!"); + SDValue InsertVec = 
N->getOperand(0); + SDValue InsertVal = N->getOperand(1); + SDValue InsertIdx = N->getOperand(2); + + // Only handle constant 0 insertion... + if (!(isNullConstant(InsertVal) || isNullFPConstant(InsertVal))) + return SDValue(); + // ... into the result of an FMUL. + if (InsertVec.getOpcode() != ISD::FMUL) + return SDValue(); + + // Insert into the operand of FMUL instead. + SDValue FMulOp = InsertVec.getOperand(0); + + if (!InsertVec.hasOneUse() || !FMulOp.hasOneUse()) + return SDValue(); + + SDValue InsertOp = + DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), FMulOp.getValueType(), + FMulOp, InsertVal, InsertIdx); + SDValue FMul = + DAG.getNode(ISD::FMUL, SDLoc(InsertVec), InsertVec.getValueType(), + InsertOp, InsertVec.getOperand(1)); + DAG.ReplaceAllUsesWith(N, &FMul); + return FMul; +} + static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (SDValue Res = removeRedundantInsertVectorElt(N)) return Res; + if (SDValue Res = commuteInsertVectorEltFMul(N, DCI.DAG)) + return Res; + return performPostLD1Combine(N, DCI, true); } diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 27c5668b06d02..1e33f81b9b835 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1189,8 +1189,8 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) { ; CHECK-LABEL: fmul_insert_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul.4s v0, v0, v1 ; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: fmul.4s v0, v0, v1 ; CHECK-NEXT: ret %mul = fmul <4 x float> %A, %B %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 From e3b38a9faff4ab3cd76506720f5d81e64b3bb871 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Fri, 11 Jul 2025 18:13:12 +0100 Subject: [PATCH 3/4] fixup! 
[ISel] Commute FMUL and inserting zero into vector lane --- .../Target/AArch64/AArch64ISelLowering.cpp | 16 ++++++--- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 36 +++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 867833f67b822..bb32ef5e13c10 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26283,16 +26283,24 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) { // Insert into the operand of FMUL instead. SDValue FMulOp = InsertVec.getOperand(0); + SDValue FMulOp2 = InsertVec.getOperand(1); - if (!InsertVec.hasOneUse() || !FMulOp.hasOneUse()) + if (!InsertVec.hasOneUse()) return SDValue(); + if (!InsertVec->isOnlyUserOf(FMulOp.getNode())) { + if (!InsertVec->isOnlyUserOf(FMulOp2.getNode())) + return SDValue(); + std::swap(FMulOp, FMulOp2); + } + SDValue InsertOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), FMulOp.getValueType(), FMulOp, InsertVal, InsertIdx); - SDValue FMul = - DAG.getNode(ISD::FMUL, SDLoc(InsertVec), InsertVec.getValueType(), - InsertOp, InsertVec.getOperand(1)); + if (FMulOp == FMulOp2) + FMulOp2 = InsertOp; + SDValue FMul = DAG.getNode(ISD::FMUL, SDLoc(InsertVec), + InsertVec.getValueType(), InsertOp, FMulOp2); DAG.ReplaceAllUsesWith(N, &FMul); return FMul; } diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 1e33f81b9b835..9f48cb788d5d2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1197,6 +1197,42 @@ define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) { ret <4 x float> %mul_set_lane } +define <4 x float> @fmul_insert_zero_same(<4 x float> %A) { +; CHECK-LABEL: fmul_insert_zero_same: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: fmul.4s v0, v0, v0 +; CHECK-NEXT: ret + %mul = 
fmul <4 x float> %A, %A + %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 + ret <4 x float> %mul_set_lane +} + +define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +; CHECK-LABEL: fmul_insert_zero1: +; CHECK: // %bb.0: +; CHECK-NEXT: fsub.4s v0, v2, v0 +; CHECK-NEXT: mov.s v1[3], wzr +; CHECK-NEXT: fmul.4s v0, v1, v0 +; CHECK-NEXT: ret + %sub = fsub <4 x float> %C, %A + %mul = fmul <4 x float> %B, %sub + %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 + ret <4 x float> %mul_set_lane +} + +define <4 x float> @fmul_insert_zero2(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: fmul_insert_zero2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: fmul.4s v0, v0, v1 +; CHECK-NEXT: fsub.4s v0, v1, v0 +; CHECK-NEXT: ret + %mul = fmul <4 x float> %B, %A + %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 + %sub = fsub <4 x float> %B, %mul_set_lane + ret <4 x float> %sub +} define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: fmulx_lane_2s: From 9cd532764a8610b3885dcfaedcfc8c544ba211a0 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Thu, 17 Jul 2025 18:50:11 +0100 Subject: [PATCH 4/4] fixup! fixup! [ISel] Commute FMUL and inserting zero into vector lane --- .../Target/AArch64/AArch64ISelLowering.cpp | 10 +++++- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 32 ++++++++++++++----- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 98b4c8e487c53..5d0f87d2472d1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26494,9 +26494,16 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) { // Only handle constant 0 insertion... if (!(isNullConstant(InsertVal) || isNullFPConstant(InsertVal))) return SDValue(); - // ... 
into the result of an FMUL. + // ... into the result of an FMUL ... if (InsertVec.getOpcode() != ISD::FMUL) return SDValue(); + // ... and only when x * 0 = 0 (no NaNs, no infs, no signed zeros). + auto Flags = InsertVec->getFlags(); + auto Options = DAG.getTarget().Options; + if ((!Options.NoNaNsFPMath && !Flags.hasNoNaNs()) || + (!Options.NoInfsFPMath && !Flags.hasNoInfs()) || + (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())) + return SDValue(); // Insert into the operand of FMUL instead. SDValue FMulOp = InsertVec.getOperand(0); @@ -26518,6 +26525,7 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) { FMulOp2 = InsertOp; SDValue FMul = DAG.getNode(ISD::FMUL, SDLoc(InsertVec), InsertVec.getValueType(), InsertOp, FMulOp2); + FMul->setFlags(Flags); DAG.ReplaceAllUsesWith(N, &FMul); return FMul; } diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 9f48cb788d5d2..21cbfef2992a6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1189,10 +1189,11 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) { ; CHECK-LABEL: fmul_insert_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v2[0] ; CHECK-NEXT: fmul.4s v0, v0, v1 ; CHECK-NEXT: ret - %mul = fmul <4 x float> %A, %B + %mul = fmul fast <4 x float> %A, %B %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 ret <4 x float> %mul_set_lane } @@ -1200,10 +1201,11 @@ define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) { define <4 x float> @fmul_insert_zero_same(<4 x float> %A) { ; CHECK-LABEL: fmul_insert_zero_same: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: fmul.4s v0, v0, v0 ; CHECK-NEXT: ret - %mul = fmul <4 x float> %A, %A +
%mul = fmul fast <4 x float> %A, %A %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 ret <4 x float> %mul_set_lane } @@ -1211,12 +1213,13 @@ define <4 x float> @fmul_insert_zero_same(<4 x float> %A) { define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float> %C) { ; CHECK-LABEL: fmul_insert_zero1: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d3, #0000000000000000 ; CHECK-NEXT: fsub.4s v0, v2, v0 -; CHECK-NEXT: mov.s v1[3], wzr +; CHECK-NEXT: mov.s v1[3], v3[0] ; CHECK-NEXT: fmul.4s v0, v1, v0 ; CHECK-NEXT: ret %sub = fsub <4 x float> %C, %A - %mul = fmul <4 x float> %B, %sub + %mul = fmul fast <4 x float> %B, %sub %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 ret <4 x float> %mul_set_lane } @@ -1224,16 +1227,29 @@ define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float define <4 x float> @fmul_insert_zero2(<4 x float> %A, <4 x float> %B) { ; CHECK-LABEL: fmul_insert_zero2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v2[0] ; CHECK-NEXT: fmul.4s v0, v0, v1 ; CHECK-NEXT: fsub.4s v0, v1, v0 ; CHECK-NEXT: ret - %mul = fmul <4 x float> %B, %A + %mul = fmul fast <4 x float> %B, %A %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 %sub = fsub <4 x float> %B, %mul_set_lane ret <4 x float> %sub } +define <4 x float> @fmul_insert_zero_nofast(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: fmul_insert_zero_nofast: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d2, #0000000000000000 +; CHECK-NEXT: fmul.4s v0, v0, v1 +; CHECK-NEXT: mov.s v0[3], v2[0] +; CHECK-NEXT: ret + %mul = fmul <4 x float> %A, %B + %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3 + ret <4 x float> %mul_set_lane +} + define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: fmulx_lane_2s: ; CHECK: // %bb.0: