Skip to content

Commit cfc6717

Browse files
AntoinePrv authored and serge-sans-paille committed
Remove one permute from swizzle float
1 parent 21d9634 commit cfc6717

File tree

1 file changed

+9
-10
lines changed

1 file changed

+9
-10
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1427,20 +1427,19 @@ namespace xsimd
14271427
template <class A>
14281428
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
14291429
{
1430-
// duplicate low and high part of input
1431-
// Duplicate lanes separately
1432-
// 1) duplicate low and high lanes
1433-
__m256 lo = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
1434-
__m256 hi = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
1430+
// swap lanes
1431+
__m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
14351432

1436-
// normalize mask
1437-
batch<uint32_t, A> half_mask = mask % 4;
1433+
// normalize mask taking modulo 4
1434+
batch<uint32_t, A> half_mask = mask & 0b11u;
14381435

14391436
// permute within each lane
1440-
__m256 r0 = _mm256_permutevar_ps(lo, half_mask);
1441-
__m256 r1 = _mm256_permutevar_ps(hi, half_mask);
1437+
__m256 r0 = _mm256_permutevar_ps(self, half_mask);
1438+
__m256 r1 = _mm256_permutevar_ps(swapped, half_mask);
14421439

1443-
batch_bool<uint32_t, A> blend_mask = mask >= 4;
1440+
// select lane by the mask index divided by 4
1441+
constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 4, 4, 4, 4> {};
1442+
batch_bool<uint32_t, A> blend_mask = (mask & 0b100u) != lane;
14441443
return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
14451444
}
14461445

0 commit comments

Comments
 (0)