From 93a6be03b51027e351458382ff8c05e65fbacbef Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele@apple.com>
Date: Fri, 27 Jun 2025 15:32:18 +0100
Subject: [PATCH 1/4] Add test for inserting zero into vector lane of fmul
 result

---
 llvm/test/CodeGen/AArch64/arm64-vmul.ll | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 937a17ca6c1e0..27c5668b06d02 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1186,6 +1186,16 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
   ret double %res
 }
 
+define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fmul_insert_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul.4s v0, v0, v1
+; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %A, %B
+  %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
+  ret <4 x float> %mul_set_lane
+}
 
 
 define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {

From e08422d223607519aeb742e774b180b148930d02 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele@apple.com>
Date: Fri, 27 Jun 2025 15:39:14 +0100
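Subject: [PATCH 2/4] [ISel] Commute FMUL and inserting zero into vector lane

Add an AArch64 DAG combine that rewrites
  insert_vector_elt (fmul A, B), 0, Idx
into
  fmul (insert_vector_elt A, 0, Idx), B
when the FMUL result and its first operand A each have a single use, and
update the fmul_insert_zero test for the resulting instruction order.

Note: no fast-math precondition is checked at this point in the series.
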
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 32 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/arm64-vmul.ll       |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c8b1eafd35495..867833f67b822 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26268,11 +26268,43 @@ static SDValue removeRedundantInsertVectorElt(SDNode *N) {
   return ExtractVec;
 }
 
+static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
+  SDValue InsertVec = N->getOperand(0);
+  SDValue InsertVal = N->getOperand(1);
+  SDValue InsertIdx = N->getOperand(2);
+
+  // Only handle constant 0 insertion...
+  if (!(isNullConstant(InsertVal) || isNullFPConstant(InsertVal)))
+    return SDValue();
+  // ... into the result of an FMUL.
+  if (InsertVec.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  // Insert into the operand of FMUL instead.
+  SDValue FMulOp = InsertVec.getOperand(0);
+
+  if (!InsertVec.hasOneUse() || !FMulOp.hasOneUse())
+    return SDValue();
+
+  SDValue InsertOp =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), FMulOp.getValueType(),
+                  FMulOp, InsertVal, InsertIdx);
+  SDValue FMul =
+      DAG.getNode(ISD::FMUL, SDLoc(InsertVec), InsertVec.getValueType(),
+                  InsertOp, InsertVec.getOperand(1));
+  DAG.ReplaceAllUsesWith(N, &FMul);
+  return FMul;
+}
+
 static SDValue
 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   if (SDValue Res = removeRedundantInsertVectorElt(N))
     return Res;
 
+  if (SDValue Res = commuteInsertVectorEltFMul(N, DCI.DAG))
+    return Res;
+
   return performPostLD1Combine(N, DCI, true);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 27c5668b06d02..1e33f81b9b835 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1189,8 +1189,8 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
 define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) {
 ; CHECK-LABEL: fmul_insert_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul.4s v0, v0, v1
 ; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    fmul.4s v0, v0, v1
 ; CHECK-NEXT:    ret
   %mul = fmul <4 x float> %A, %B
   %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3

From e3b38a9faff4ab3cd76506720f5d81e64b3bb871 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele@apple.com>
Date: Fri, 11 Jul 2025 18:13:12 +0100
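Subject: [PATCH 3/4] fixup! [ISel] Commute FMUL and inserting zero into vector
 lane

Allow the zero to be inserted into either FMUL operand: pick the first
operand if the FMUL is its only user, otherwise fall back to the second
operand under the same condition. Handle fmul A, A by using the new insert
for both operands. Add tests for the same-operand case and for operands and
results with other users.
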
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 16 ++++++---
 llvm/test/CodeGen/AArch64/arm64-vmul.ll       | 36 +++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 867833f67b822..bb32ef5e13c10 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26283,16 +26283,24 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) {
 
   // Insert into the operand of FMUL instead.
   SDValue FMulOp = InsertVec.getOperand(0);
+  SDValue FMulOp2 = InsertVec.getOperand(1);
 
-  if (!InsertVec.hasOneUse() || !FMulOp.hasOneUse())
+  if (!InsertVec.hasOneUse())
     return SDValue();
 
+  if (!InsertVec->isOnlyUserOf(FMulOp.getNode())) {
+    if (!InsertVec->isOnlyUserOf(FMulOp2.getNode()))
+      return SDValue();
+    std::swap(FMulOp, FMulOp2);
+  }
+
   SDValue InsertOp =
       DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), FMulOp.getValueType(),
                   FMulOp, InsertVal, InsertIdx);
-  SDValue FMul =
-      DAG.getNode(ISD::FMUL, SDLoc(InsertVec), InsertVec.getValueType(),
-                  InsertOp, InsertVec.getOperand(1));
+  if (FMulOp == FMulOp2)
+    FMulOp2 = InsertOp;
+  SDValue FMul = DAG.getNode(ISD::FMUL, SDLoc(InsertVec),
+                             InsertVec.getValueType(), InsertOp, FMulOp2);
   DAG.ReplaceAllUsesWith(N, &FMul);
   return FMul;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 1e33f81b9b835..9f48cb788d5d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1197,6 +1197,42 @@ define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) {
   ret <4 x float> %mul_set_lane
 }
 
+define <4 x float> @fmul_insert_zero_same(<4 x float> %A) {
+; CHECK-LABEL: fmul_insert_zero_same:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    fmul.4s v0, v0, v0
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %A, %A
+  %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
+  ret <4 x float> %mul_set_lane
+}
+
+define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+; CHECK-LABEL: fmul_insert_zero1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub.4s v0, v2, v0
+; CHECK-NEXT:    mov.s v1[3], wzr
+; CHECK-NEXT:    fmul.4s v0, v1, v0
+; CHECK-NEXT:    ret
+  %sub = fsub <4 x float> %C, %A
+  %mul = fmul <4 x float> %B, %sub
+  %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
+  ret <4 x float> %mul_set_lane
+}
+
+define <4 x float> @fmul_insert_zero2(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fmul_insert_zero2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    fmul.4s v0, v0, v1
+; CHECK-NEXT:    fsub.4s v0, v1, v0
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %B, %A
+  %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
+  %sub = fsub <4 x float> %B, %mul_set_lane
+  ret <4 x float> %sub
+}
 
 define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
 ; CHECK-LABEL: fmulx_lane_2s:

From 9cd532764a8610b3885dcfaedcfc8c544ba211a0 Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele@apple.com>
Date: Thu, 17 Jul 2025 18:50:11 +0100
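Subject: [PATCH 4/4] fixup! fixup! [ISel] Commute FMUL and inserting zero into
 vector lane

Restrict the combine to FMULs for which no-NaNs, no-infs and no-signed-zeros
each hold, either via the node's fast-math flags or via the corresponding
TargetOptions, and copy the original flags onto the new FMUL. Mark the
existing tests' fmul as fast and add fmul_insert_zero_nofast as a negative
test.
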
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 +++++-
 llvm/test/CodeGen/AArch64/arm64-vmul.ll       | 32 ++++++++++++++-----
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 98b4c8e487c53..5d0f87d2472d1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26494,9 +26494,16 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) {
   // Only handle constant 0 insertion...
   if (!(isNullConstant(InsertVal) || isNullFPConstant(InsertVal)))
     return SDValue();
-  // ... into the result of an FMUL.
+  // ... into the result of an FMUL ...
   if (InsertVec.getOpcode() != ISD::FMUL)
     return SDValue();
+  // ... and only when x * 0 = 0 (needs no NaNs, no infs, no signed zeros).
+  auto Flags = InsertVec->getFlags();
+  auto Options = DAG.getTarget().Options;
+  if ((!Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||
+      (!Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
+      (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()))
+    return SDValue();
 
   // Insert into the operand of FMUL instead.
   SDValue FMulOp = InsertVec.getOperand(0);
@@ -26518,6 +26525,7 @@ static SDValue commuteInsertVectorEltFMul(SDNode *N, SelectionDAG &DAG) {
     FMulOp2 = InsertOp;
   SDValue FMul = DAG.getNode(ISD::FMUL, SDLoc(InsertVec),
                              InsertVec.getValueType(), InsertOp, FMulOp2);
+  FMul->setFlags(Flags);
   DAG.ReplaceAllUsesWith(N, &FMul);
   return FMul;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 9f48cb788d5d2..21cbfef2992a6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1189,10 +1189,11 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
 define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) {
 ; CHECK-LABEL: fmul_insert_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    movi d2, #0000000000000000
+; CHECK-NEXT:    mov.s v0[3], v2[0]
 ; CHECK-NEXT:    fmul.4s v0, v0, v1
 ; CHECK-NEXT:    ret
-  %mul = fmul <4 x float> %A, %B
+  %mul = fmul fast <4 x float> %A, %B
   %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
   ret <4 x float> %mul_set_lane
 }
@@ -1200,10 +1201,11 @@ define <4 x float> @fmul_insert_zero(<4 x float> %A, <4 x float> %B) {
 define <4 x float> @fmul_insert_zero_same(<4 x float> %A) {
 ; CHECK-LABEL: fmul_insert_zero_same:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    mov.s v0[3], v1[0]
 ; CHECK-NEXT:    fmul.4s v0, v0, v0
 ; CHECK-NEXT:    ret
-  %mul = fmul <4 x float> %A, %A
+  %mul = fmul fast <4 x float> %A, %A
   %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
   ret <4 x float> %mul_set_lane
 }
@@ -1211,12 +1213,13 @@ define <4 x float> @fmul_insert_zero_same(<4 x float> %A) {
 define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
 ; CHECK-LABEL: fmul_insert_zero1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d3, #0000000000000000
 ; CHECK-NEXT:    fsub.4s v0, v2, v0
-; CHECK-NEXT:    mov.s v1[3], wzr
+; CHECK-NEXT:    mov.s v1[3], v3[0]
 ; CHECK-NEXT:    fmul.4s v0, v1, v0
 ; CHECK-NEXT:    ret
   %sub = fsub <4 x float> %C, %A
-  %mul = fmul <4 x float> %B, %sub
+  %mul = fmul fast <4 x float> %B, %sub
   %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
   ret <4 x float> %mul_set_lane
 }
@@ -1224,16 +1227,29 @@ define <4 x float> @fmul_insert_zero1(<4 x float> %A, <4 x float> %B, <4 x float
 define <4 x float> @fmul_insert_zero2(<4 x float> %A, <4 x float> %B) {
 ; CHECK-LABEL: fmul_insert_zero2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov.s v0[3], wzr
+; CHECK-NEXT:    movi d2, #0000000000000000
+; CHECK-NEXT:    mov.s v0[3], v2[0]
 ; CHECK-NEXT:    fmul.4s v0, v0, v1
 ; CHECK-NEXT:    fsub.4s v0, v1, v0
 ; CHECK-NEXT:    ret
-  %mul = fmul <4 x float> %B, %A
+  %mul = fmul fast <4 x float> %B, %A
   %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
   %sub = fsub <4 x float> %B, %mul_set_lane
   ret <4 x float> %sub
 }
 
+define <4 x float> @fmul_insert_zero_nofast(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fmul_insert_zero_nofast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d2, #0000000000000000
+; CHECK-NEXT:    fmul.4s v0, v0, v1
+; CHECK-NEXT:    mov.s v0[3], v2[0]
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %A, %B
+  %mul_set_lane = insertelement <4 x float> %mul, float 0.000000e+00, i64 3
+  ret <4 x float> %mul_set_lane
+}
+
 define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
 ; CHECK-LABEL: fmulx_lane_2s:
 ; CHECK:       // %bb.0: