@@ -1427,20 +1427,19 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
         {
-            // duplicate low and high part of input
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high | high]
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]

-            // normalize mask
-            batch<uint32_t, A> half_mask = mask % 4;
+            // normalize the mask by taking it modulo 4
+            batch<uint32_t, A> half_mask = mask & 0b11u;

             // permute within each lane
-            __m256 r0 = _mm256_permutevar_ps(lo, half_mask);
-            __m256 r1 = _mm256_permutevar_ps(hi, half_mask);
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask);
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask);

-            batch_bool<uint32_t, A> blend_mask = mask >= 4;
+            // pick from the swapped lanes when the requested lane (mask index / 4) differs from the destination lane
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 4, 4, 4, 4> {};
+            batch_bool<uint32_t, A> blend_mask = (mask & 0b100u) != lane;
             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
         }

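For reference, here is a minimal scalar sketch of what the new lane-swap formulation computes. It is an illustration only, not part of the patch; the swizzle_ref helper, its name, and the array-based signature are hypothetical.

#include <array>
#include <cstdint>

// Scalar model of the AVX path above: out[i] = in[mask[i] % 8], decomposed the
// same way the intrinsics do it (per-lane permute + lane select).
std::array<float, 8> swizzle_ref(std::array<float, 8> const& in, std::array<uint32_t, 8> const& mask)
{
    std::array<float, 8> out {};
    for (uint32_t i = 0; i < 8; ++i)
    {
        uint32_t half_mask = mask[i] & 0b11u;        // position within a 128-bit lane
        uint32_t src_lane = (mask[i] & 0b100u) >> 2; // requested lane (0 = low, 1 = high)
        uint32_t dst_lane = i >> 2;                  // lane this result element lives in
        // r0 reads from the element's own lane of `self`, r1 from the swapped lanes;
        // the blend picks r1 exactly when the requested lane differs from the destination lane.
        float r0 = in[dst_lane * 4 + half_mask];
        float r1 = in[(1 - dst_lane) * 4 + half_mask];
        out[i] = (src_lane != dst_lane) ? r1 : r0;
    }
    return out;
}

This mirrors the single lane-swap trick in the patch: one _mm256_permute2f128_ps instead of two, with the lane selection folded into a single compare against a batch_constant.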