@@ -921,26 +921,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -951,26 +951,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -981,26 +981,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -1011,26 +1011,26 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm3
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm10, %ymm4
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm10, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;