@@ -1344,6 +1344,52 @@ namespace xsimd
             return first(acc3, A {});
         }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
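+            // Fold the upper two lanes onto the lower two: movehl copies lanes
+            // {3, 2} over {1, 0}, so the low lanes of tmp0 hold the pairwise
+            // products {x0*x2, x1*x3}.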
+            __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
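+            // Bring lane 1 down to lane 0 and multiply the two remaining
+            // scalars (mul_ss only touches lane 0).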
+            __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+            return _mm_cvtss_f32(tmp1);
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
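+            // unpackhi_pd broadcasts the high lane; mul_sd then multiplies
+            // low * high in lane 0, which cvtsd_f64 extracts.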
+            return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self)));
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
+            {
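+                // Reverse the four 32-bit lanes and multiply pairwise, then
+                // fold the upper two products onto the lower two and multiply
+                // again; lane 0 ends up with the product of all four lanes.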
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
+            {
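+                // Two 64-bit lanes: copy the high lane down and multiply.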
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                auto tmp2 = tmp1 * self;
+#if defined(__x86_64__)
+                return _mm_cvtsi128_si64(tmp2);
+#else
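+                // _mm_cvtsi128_si64 is unavailable on 32-bit x86: store the
+                // low 64 bits and read them back through memcpy.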
+                __m128i m;
+                _mm_storel_epi64(&m, tmp2);
+                int64_t i;
+                std::memcpy(&i, &m, sizeof(i));
+                return i;
+#endif
+            }
+            else
+            {
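+                // 8- and 16-bit element types fall back to the arch-agnostic
+                // common implementation.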
+                return reduce_mul(self, common {});
+            }
+        }
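+        // Usage sketch, assuming the matching public entry point
+        // xsimd::reduce_mul dispatches to these kernels:
+        //     xsimd::batch<float, xsimd::sse2> v { 1.f, 2.f, 3.f, 4.f };
+        //     float p = xsimd::reduce_mul(v); // 1 * 2 * 3 * 4 = 24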
+
         // rsqrt
         template <class A>
         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept