Skip to content

Commit 8c72b2a

Browse files
authored
[X86] LowervXi8MulWithUNPCK - remove special case constant folding handling (#163567)
Leave this to shuffle folding instead.
1 parent 8db1aab commit 8c72b2a

11 files changed

+157
-194
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -29755,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
2975529755
const X86Subtarget &Subtarget,
2975629756
SelectionDAG &DAG,
2975729757
SDValue *Low = nullptr) {
29758-
unsigned NumElts = VT.getVectorNumElements();
29759-
2976029758
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
2976129759
// to a vXi16 type. Do the multiplies, shift the results and pack the half
2976229760
// lane results back together.
2976329761

2976429762
// We'll take different approaches for signed and unsigned.
29765-
// For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
29766-
// and use pmullw to calculate the full 16-bit product.
29763+
// For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
29764+
// words and use pmullw to calculate the full 16-bit product.
2976729765
// For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
2976829766
// shift them left into the upper byte of each word. This allows us to use
2976929767
// pmulhw to calculate the full 16-bit product. This trick means we don't
2977029768
// need to sign extend the bytes to use pmullw.
29771-
29772-
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29769+
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
2977329770
SDValue Zero = DAG.getConstant(0, dl, VT);
2977429771

29775-
SDValue ALo, AHi;
29772+
SDValue ALo, AHi, BLo, BHi;
2977629773
if (IsSigned) {
2977729774
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29778-
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29779-
} else {
29780-
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29781-
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29782-
}
29783-
29784-
SDValue BLo, BHi;
29785-
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29786-
// If the RHS is a constant, manually unpackl/unpackh and extend.
29787-
SmallVector<SDValue, 16> LoOps, HiOps;
29788-
for (unsigned i = 0; i != NumElts; i += 16) {
29789-
for (unsigned j = 0; j != 8; ++j) {
29790-
SDValue LoOp = B.getOperand(i + j);
29791-
SDValue HiOp = B.getOperand(i + j + 8);
29792-
29793-
if (IsSigned) {
29794-
LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29795-
HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29796-
LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29797-
DAG.getConstant(8, dl, MVT::i16));
29798-
HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29799-
DAG.getConstant(8, dl, MVT::i16));
29800-
} else {
29801-
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29802-
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29803-
}
29804-
29805-
LoOps.push_back(LoOp);
29806-
HiOps.push_back(HiOp);
29807-
}
29808-
}
29809-
29810-
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29811-
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29812-
} else if (IsSigned) {
2981329775
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29776+
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
2981429777
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
2981529778
} else {
29779+
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
2981629780
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29781+
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
2981729782
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
2981829783
}
2981929784

@@ -29826,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
2982629791
if (Low)
2982729792
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
2982829793

29829-
return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29794+
return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
2983029795
}
2983129796

2983229797
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,

llvm/test/CodeGen/X86/combine-sdiv.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2927,7 +2927,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
29272927
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
29282928
; SSE2-NEXT: pxor %xmm3, %xmm3
29292929
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2930-
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
2930+
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
29312931
; SSE2-NEXT: psrlw $8, %xmm3
29322932
; SSE2-NEXT: packuswb %xmm3, %xmm1
29332933
; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -2947,7 +2947,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
29472947
; SSE41-NEXT: pxor %xmm1, %xmm1
29482948
; SSE41-NEXT: pxor %xmm2, %xmm2
29492949
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2950-
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
2950+
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
29512951
; SSE41-NEXT: psrlw $8, %xmm2
29522952
; SSE41-NEXT: packuswb %xmm2, %xmm1
29532953
; SSE41-NEXT: paddb %xmm0, %xmm1
@@ -2971,7 +2971,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
29712971
; AVX1: # %bb.0:
29722972
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
29732973
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2974-
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
2974+
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
29752975
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
29762976
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
29772977
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
@@ -3044,7 +3044,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
30443044
; XOP: # %bb.0:
30453045
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
30463046
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3047-
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
3047+
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,147]
30483048
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
30493049
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
30503050
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1

llvm/test/CodeGen/X86/combine-udiv.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -665,14 +665,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
665665
;
666666
; XOP-LABEL: combine_vec_udiv_nonuniform4:
667667
; XOP: # %bb.0:
668-
; XOP-NEXT: movl $171, %eax
668+
; XOP-NEXT: movl $249, %eax
669669
; XOP-NEXT: vmovd %eax, %xmm1
670670
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
671-
; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
672-
; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
673-
; XOP-NEXT: movl $249, %eax
674-
; XOP-NEXT: vmovd %eax, %xmm2
675-
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
671+
; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [171,0,0,0]
672+
; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2
673+
; XOP-NEXT: vpshlb %xmm1, %xmm2, %xmm1
676674
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
677675
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
678676
; XOP-NEXT: retq

llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
99
; AVX256BW: # %bb.0:
1010
; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1111
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
12-
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
12+
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
1313
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
1414
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
1515
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]

llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2335,10 +2335,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
23352335
; CHECK-AVX1: # %bb.0:
23362336
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
23372337
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2338-
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [34048,34048,26368,37632,21760,33024,22016,35072]
2338+
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
23392339
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
23402340
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2341-
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [20224,26368,6912,30976,33024,33024,33024,12032]
2341+
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
23422342
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
23432343
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
23442344
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
@@ -2369,10 +2369,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
23692369
; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4
23702370
; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
23712371
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2372-
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [2304,0,10496,37632,33024,33024,21760,36096]
2372+
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
23732373
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
23742374
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2375-
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [22016,24320,37632,11008,12544,32512,16640,37632]
2375+
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
23762376
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
23772377
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
23782378
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
@@ -2417,10 +2417,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
24172417
; CHECK-AVX2: # %bb.0:
24182418
; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
24192419
; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2420-
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [34048,34048,26368,37632,21760,33024,22016,35072,2304,0,10496,37632,33024,33024,21760,36096]
2420+
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137,0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
24212421
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
24222422
; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2423-
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [20224,26368,6912,30976,33024,33024,33024,12032,22016,24320,37632,11008,12544,32512,16640,37632]
2423+
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
24242424
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
24252425
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
24262426
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]

0 commit comments

Comments
 (0)