@@ -31,6 +31,14 @@ namespace xsimd {
// Reassembles one 256-bit integer register from two 128-bit halves.
// `low` is widened with _mm256_castsi128_si256, then `high` is inserted at
// 128-bit lane index 1 by _mm256_insertf128_si256.
3131 inline __m256i merge_sse (__m128i low, __m128i high) {
3232 return _mm256_insertf128_si256 (_mm256_castsi128_si256 (low), high, 1 );
3333 }
// Unary forwarding helper introduced by this change.
// Applies the 128-bit callable `f` to each half of the 256-bit value `self`
// and returns the two results recombined with merge_sse.
//   f    - callable invoked once per half with a __m128i; its result is
//          stored in a __m128i.
//   self - 256-bit integer input.
// split_avx is defined outside this excerpt; presumably it fills its two
// output arguments with the low and high 128-bit halves of `self` - confirm.
34+ template <class F >
35+ __m256i fwd_to_sse (F f, __m256i self) {
36+ __m128i self_low, self_high;
37+ split_avx (self, self_low, self_high);
38+ __m128i res_low = f (self_low);
39+ __m128i res_high = f (self_high);
40+ return merge_sse (res_low, res_high);
41+ }
3442 template <class F >
3543 __m256i fwd_to_sse (F f, __m256i self, __m256i other) {
3644 __m128i self_low, self_high, other_low, other_high;
@@ -63,13 +71,7 @@ namespace xsimd {
6371 // add
// Integral element-wise addition for the avx architecture tag (enabled only
// when T satisfies std::is_integral).
// After this change (the `+` line) every element size takes one path:
// detail::fwd_to_sse runs the lambda on each pair of 128-bit halves, and the
// lambda adds them as batch<T, sse4_2> values.
// The removed `-` lines switched on sizeof(T) and called _mm256_add_epi8 /
// _mm256_add_epi16 directly for 1- and 2-byte elements.
// NOTE(review): the Intel Intrinsics Guide lists those two as AVX2
// intrinsics, which is presumably why they were dropped here - confirm.
// NOTE(review): unlike several sibling kernels, the lambda calls add() without
// an explicit sse4_2{} tag; presumably it resolves through a two-argument add
// declared elsewhere - confirm.
6472 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
6573 batch<T, A> add (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
66- switch (sizeof (T)) {
67- case 1 : return _mm256_add_epi8 (self, other);
68- case 2 : return _mm256_add_epi16 (self, other);
69- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
70- case 8 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
71- default : assert (false && " unsupported arch/op combination" ); return {};
72- }
74+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
7375 }
7476 template <class A > batch<float , A> add (batch<float , A> const & self, batch<float , A> const & other, requires_arch<avx>) {
7577 return _mm256_add_ps (self, other);
@@ -153,24 +155,17 @@ namespace xsimd {
153155 // bitwise_lshift
// Integral left shift by the scalar count `other` for the avx architecture tag.
// After this change (the `+` line) every element size is forwarded through
// detail::fwd_to_sse to the sse4_2 bitwise_lshift kernel, which is called with
// an explicit sse4_2{} tag.
// The removed `-` lines switched on sizeof(T): they called _mm256_slli_epi16 /
// _mm256_slli_epi64 for 2- and 8-byte elements, and built the 1-byte case from
// _mm256_slli_epi32 plus a mask.
// NOTE(review): this relies on a detail::fwd_to_sse overload that accepts a
// scalar second argument (the lambda takes an int32_t); that overload is not
// visible in this excerpt - confirm it exists.
154156 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
155157 batch<T, A> bitwise_lshift (batch<T, A> const & self, int32_t other, requires_arch<avx>) {
156- switch (sizeof (T)) {
157- case 1 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, _mm256_set1_epi8 (0xFF << other), _mm256_slli_epi32 (self, other));
158-
159- case 2 : return _mm256_slli_epi16 (self, other);
160- case 4 : return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_lshift (batch<T, sse4_2>(s), o, sse4_2{}); },self, other);
161- case 8 : return _mm256_slli_epi64 (self, other);
162- default : assert (false && " unsupported arch/op combination" ); return {};
163- }
158+ return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_lshift (batch<T, sse4_2>(s), o, sse4_2{}); },self, other);
164159 }
165160
166161 // bitwise_not
// Bitwise complement of an integral batch for the avx architecture tag.
// The `+` line applies the sse4_2 bitwise_not kernel to each 128-bit half via
// the unary detail::fwd_to_sse helper.
// The removed `-` line XORed the whole 256-bit register with an all-ones
// constant using _mm256_xor_si256.
167162 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
168163 batch<T, A> bitwise_not (batch<T, A> const & self, requires_arch<avx>) {
169- return _mm256_xor_si256 (self, _mm256_set1_epi32 (- 1 ) );
164+ return detail::fwd_to_sse ([](__m128i s) { return bitwise_not (batch<T, sse4_2>(s), sse4_2{}); }, self );
170165 }
// Bitwise complement of an integral boolean mask (batch_bool) for the avx
// architecture tag.
// The `+` line applies the sse4_2 batch_bool bitwise_not kernel to each
// 128-bit half via the unary detail::fwd_to_sse helper.
// The removed `-` line XORed the whole 256-bit register with an all-ones
// constant using _mm256_xor_si256.
171166 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
172167 batch_bool<T, A> bitwise_not (batch_bool<T, A> const & self, requires_arch<avx>) {
173- return _mm256_xor_si256 (self, _mm256_set1_epi32 (- 1 ) );
168+ return detail::fwd_to_sse ([](__m128i s) { return bitwise_not (batch_bool<T, sse4_2>(s), sse4_2{}); }, self );
174169 }
175170
176171 // bitwise_or
@@ -188,48 +183,17 @@ namespace xsimd {
188183 }
// Bitwise OR of two integral batches for the avx architecture tag.
// The `+` line ORs each pair of 128-bit halves as batch<T, sse4_2> values via
// detail::fwd_to_sse.
// The removed `-` line used the 256-bit _mm256_or_si256 intrinsic directly.
// NOTE(review): the lambda calls bitwise_or() without an explicit sse4_2{} tag,
// unlike bitwise_not / bitwise_xor in this file - confirm this is intended.
189184 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
190185 batch<T, A> bitwise_or (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
191- return _mm256_or_si256 ( self, other);
186+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_or (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
192187 }
// Bitwise OR of two integral boolean masks (batch_bool) for the avx
// architecture tag.
// The `+` line ORs each pair of 128-bit halves as batch_bool<T, sse4_2> values
// via detail::fwd_to_sse.
// The removed `-` line used the 256-bit _mm256_or_si256 intrinsic directly.
193188 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
194189 batch_bool<T, A> bitwise_or (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
195- return _mm256_or_si256 ( self, other);
190+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_or (batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); }, self, other);
196191 }
197192
198193 // bitwise_rshift
// Integral right shift by the scalar count `other` for the avx architecture tag.
// After this change (the single `+` line at the end) every case, signed or
// unsigned and of any element size, is forwarded through detail::fwd_to_sse to
// the sse4_2 bitwise_rshift kernel, called with an explicit sse4_2{} tag.
// The removed `-` lines branched on std::is_signed<T> and then on sizeof(T),
// using _mm256_srai_* for signed and _mm256_srli_* for unsigned elements, with
// hand-built sequences for signed 1-byte and 8-byte elements.
// NOTE(review): this relies on a detail::fwd_to_sse overload that accepts a
// scalar second argument (the lambda takes an int32_t); that overload is not
// visible in this excerpt - confirm it exists.
199194 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
200195 batch<T, A> bitwise_rshift (batch<T, A> const & self, int32_t other, requires_arch<avx>) {
201- if (std::is_signed<T>::value) {
202- switch (sizeof (T)) {
203- case 1 : {
204- __m256i sign_mask = _mm256_set1_epi16 ((0xFF00 >> other) & 0x00FF );
205- __m256i cmp_is_negative = _mm256_cmpgt_epi8 (_mm256_setzero_si256 (), self);
206- __m256i res = _mm256_srai_epi16 (self, other);
207- return _mm256_or_si256 (
208- detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, sign_mask, cmp_is_negative),
209- _mm256_andnot_si256 (sign_mask, res));
210- }
211- case 2 : return _mm256_srai_epi16 (self, other);
212- case 4 : return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_rshift (batch<T, sse4_2>(s), o, sse4_2{}); }, self, other);
213- case 8 : {
214- // from https://github.com/samyvilar/vect/blob/master/vect_128.h
215- return _mm256_or_si256 (
216- _mm256_srli_epi64 (self, other),
217- _mm256_slli_epi64 (
218- detail::fwd_to_sse ([](__m128i s, int32_t o) { return _mm_srai_epi32 (s, o); }, _mm256_shuffle_epi32 (self, _MM_SHUFFLE (3 , 3 , 1 , 1 )), 32 ),
219- 64 - other));
220- }
221- default : assert (false && " unsupported arch/op combination" ); return {};
222- }
223- }
224- else {
225- switch (sizeof (T)) {
226- case 1 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, _mm256_set1_epi8 (0xFF >> other), _mm256_srli_epi32 (self, other));
227- case 2 : return _mm256_srli_epi16 (self, other);
228- case 4 : return _mm256_srli_epi32 (self, other);
229- case 8 : return _mm256_srli_epi64 (self, other);
230- default : assert (false && " unsupported arch/op combination" ); return {};
231- }
232- }
196+ return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_rshift (batch<T, sse4_2>(s), o, sse4_2{}); }, self, other);
233197 }
234198
235199 // bitwise_xor
@@ -247,8 +211,15 @@ namespace xsimd {
247211 }
// Bitwise XOR of two integral batches for the avx architecture tag.
// The `+` lines XOR each pair of 128-bit halves as batch<T, sse4_2> values via
// detail::fwd_to_sse, calling the sse4_2 kernel with an explicit sse4_2{} tag.
// The removed `-` line used the 256-bit _mm256_xor_si256 intrinsic directly.
248212 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
249213 batch<T, A> bitwise_xor (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
250- return _mm256_xor_si256 (self, other);
214+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_xor (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },
215+ self, other);
251216 }
// Bitwise XOR of two integral boolean masks (batch_bool) for the avx
// architecture tag.
// Each 256-bit mask is processed as two 128-bit halves by detail::fwd_to_sse;
// the lambda XORs each pair of halves as batch_bool<T, sse4_2> values through
// the sse4_2 kernel (explicit sse4_2{} tag).
// Returns batch_bool<T, A>: both operands and the per-half results are masks,
// and this matches the batch_bool overloads of bitwise_not and bitwise_or in
// this file. The overload was previously declared as returning batch<T, A>.
217+ template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
218+ batch_bool<T, A> bitwise_xor (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
219+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_xor (batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2{}); },
220+ self, other);
221+ }
222+
252223 // bitwise_cast
253224 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
254225 batch<float , A> bitwise_cast (batch<T, A> const & self, batch<float , A> const &, requires_arch<avx>) {
@@ -414,20 +385,9 @@ namespace xsimd {
414385 }
// Integral element-wise equality for the avx architecture tag; the result is a
// batch_bool<T, A> mask.
// After this change (the `+` line) every element size is forwarded through
// detail::fwd_to_sse to the sse4_2 eq kernel, called with an explicit
// sse4_2{} tag.
// The removed `-` lines switched on sizeof(T): they called _mm256_cmpeq_epi8 /
// _mm256_cmpeq_epi16 for 1- and 2-byte elements and assembled the 8-byte case
// from a forwarded compare plus _mm256_shuffle_epi32, an AND and an
// arithmetic shift.
415386 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
416387 batch_bool<T, A> eq (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
417- switch (sizeof (T)) {
418- case 1 : return _mm256_cmpeq_epi8 (self, other);
419- case 2 : return _mm256_cmpeq_epi16 (self, other);
420- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },self, other);
421- case 8 : {
422- __m256i tmp1 = detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },self, other);
423- __m256i tmp2 = _mm256_shuffle_epi32 (tmp1, 0xB1 );
424- __m256i tmp3 = detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, tmp1, tmp2);
425- __m256i tmp4 = detail::fwd_to_sse ([](__m128i s, uint32_t o) { return _mm_srai_epi32 (s, o); }, tmp3, 31 );
426- return _mm256_shuffle_epi32 (tmp4, 0xF5 );
427- }
428- default : assert (false && " unsupported arch/op combination" ); return {};
429- }
388+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },self, other);
430389 }
390+
431391 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
432392 batch_bool<T, A> eq (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
433393 return eq (batch<T, A>(self.data ), batch<T, A>(other.data ));
@@ -868,13 +828,7 @@ namespace xsimd {
868828 // sub
// Integral element-wise subtraction for the avx architecture tag (enabled only
// when T satisfies std::is_integral).
// After this change (the `+` line) every element size takes one path:
// detail::fwd_to_sse runs the lambda on each pair of 128-bit halves, and the
// lambda subtracts them as batch<T, sse4_2> values.
// The removed `-` lines switched on sizeof(T) and called _mm256_sub_epi8 /
// _mm256_sub_epi16 / _mm256_sub_epi64 directly for 1-, 2- and 8-byte elements.
// NOTE(review): the lambda calls sub() without an explicit sse4_2{} tag,
// unlike several sibling kernels; presumably it resolves through a
// two-argument sub declared elsewhere - confirm.
869829 template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
870830 batch<T, A> sub (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
871- switch (sizeof (T)) {
872- case 1 : return _mm256_sub_epi8 (self, other);
873- case 2 : return _mm256_sub_epi16 (self, other);
874- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return sub (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
875- case 8 : return _mm256_sub_epi64 (self, other);
876- default : assert (false && " unsupported arch/op combination" ); return {};
877- }
831+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return sub (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
878832 }
879833 template <class A > batch<float , A> sub (batch<float , A> const & self, batch<float , A> const & other, requires_arch<avx>) {
880834 return _mm256_sub_ps (self, other);
0 commit comments