Proposal load/store masked #1162
@@ -18,6 +18,7 @@

```cpp
#include <type_traits>

#include "../types/xsimd_avx_register.hpp"
#include "../types/xsimd_batch_constant.hpp"

namespace xsimd
{
```
@@ -871,6 +872,132 @@ namespace xsimd

```cpp
            return _mm256_loadu_pd(mem);
        }

        // AVX helpers to avoid type-based branching in the generic load_masked
        namespace detail
        {
            template <class A>
            XSIMD_INLINE batch<float, A> maskload(float const* mem, batch<as_integer_t<float>, A> const& mask) noexcept
            {
                return _mm256_maskload_ps(mem, mask);
            }

            template <class A>
            XSIMD_INLINE batch<double, A> maskload(double const* mem, batch<as_integer_t<double>, A> const& mask) noexcept
            {
                return _mm256_maskload_pd(mem, mask);
            }

            template <class A>
            XSIMD_INLINE batch<float, A> zero_extend(batch<float, A> const& hi) noexcept
            {
                return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
            }

            template <class A>
            XSIMD_INLINE batch<double, A> zero_extend(batch<double, A> const& hi) noexcept
            {
                return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
            }

            // allow inserting a 128-bit SSE batch into the upper half of an AVX batch
            template <class A, class SrcA>
            XSIMD_INLINE batch<float, A> zero_extend(batch<float, SrcA> const& hi) noexcept
            {
                return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
            }

            template <class A, class SrcA>
            XSIMD_INLINE batch<double, A> zero_extend(batch<double, SrcA> const& hi) noexcept
            {
                return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
            }
        }

        // load_masked (single overload for float/double)
        template <class A, class T, bool... Values, class Mode, class = typename std::enable_if<std::is_floating_point<T>::value>::type>
        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx>) noexcept
        {
            using int_t = as_integer_t<T>;
            constexpr size_t half_size = batch<T, A>::size / 2;

            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return batch<T, A>(T { 0 });
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                return load<A>(mem, Mode {});
            }
            // confined to lower 128-bit half → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
                return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
            }
            // confined to upper 128-bit half → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
                return detail::zero_extend<A>(hi);
            }
            else
            {
                // crossing 128-bit boundary → use 256-bit masked load
                return detail::maskload<A>(mem, mask.as_batch());
            }
        }
```
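To make the dispatch above concrete, here is a minimal usage sketch. The public front-end that would normally route to this kernel is not part of this diff, so the direct `xsimd::kernel::` invocation, the `convert<float>` tag and the `aligned_mode` tag are assumptions modelled on how the neighbouring xsimd kernels are called.

```cpp
#include <xsimd/xsimd.hpp>

void load_masked_example()
{
    alignas(32) float data[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };

    // Only the first three lanes are requested; the mask is confined to the
    // lower 128-bit half, so the kernel takes the countl_zero() branch above
    // (SSE load + zero-extend) instead of issuing a 256-bit vmaskmovps.
    constexpr xsimd::batch_bool_constant<float, xsimd::avx,
                                         true, true, true, false,
                                         false, false, false, false>
        mask {};

    auto v = xsimd::kernel::load_masked(data, mask,
                                        xsimd::kernel::convert<float> {},
                                        xsimd::aligned_mode {},
                                        xsimd::avx {});
    // v == { 1, 2, 3, 0, 0, 0, 0, 0 }: masked-out lanes are zero-filled.
    (void)v;
}
```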
```cpp
        // store_masked
        namespace detail
        {
            template <class A>
            XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
            {
                _mm256_maskstore_ps(mem, mask, src);
            }

            template <class A>
            XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
            {
                _mm256_maskstore_pd(mem, mask, src);
            }
        }

        template <class A, class T, bool... Values, class Mode>
        XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
        {
            constexpr size_t half_size = batch<T, A>::size / 2;

            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return;
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                src.store(mem, Mode {});
            }
            // confined to lower 128-bit half → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                const auto lo = detail::lower_half(src);
                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
            }
            // confined to upper 128-bit half → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                const auto hi = detail::upper_half(src);
                store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
            }
            else
            {
                detail::maskstore(mem, mask.as_batch(), src);
            }
        }
```
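A matching sketch for the store path, under the same assumptions as the load example above (direct kernel invocation; the mode tag is an assumption). The useful property is that lanes whose mask bit is false are left untouched in memory.

```cpp
#include <xsimd/xsimd.hpp>

void store_masked_example()
{
    constexpr xsimd::batch_bool_constant<float, xsimd::avx,
                                         true, true, true, false,
                                         false, false, false, false>
        mask {};
    xsimd::batch<float, xsimd::avx> src(42.f);

    alignas(32) float out[8] = { -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f };
    xsimd::kernel::store_masked(out, src, mask, xsimd::aligned_mode {}, xsimd::avx {});
    // out == { 42, 42, 42, -1, -1, -1, -1, -1 }: unmasked lanes keep their values.
}
```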
|
Contributor

Epiphany: if you express everything in terms of lower / upper half, you're pretty close to a type-generic implementation that would avoid a lot of redundancy. Just need to wrap the actual
Contributor (Author)

I guess you are right. It might be possible to express things in terms of size and halves. This might be recursive, though, and we might need to map an arch to its half arch, similarly to what happens in widen.
Contributor (Author)

I'll also have to think about whether the size of the element matters. I'll have a look next week.
Contributor (Author)

The reason why I kept the implementation separate is as follows. A general implementation in terms of upper/lower halves is possible. That halves the code in avx, but avx2 still has to provide a new implementation so that it can take advantage of the new intrinsics, so we go from 2 versions in avx down to 1, and from 2 in avx2 down to 1. Same for avx512. But then, for merging, we cannot use more optimized intrinsics than _mm256_castps128_ps256. Is it worth it? The alternative is to pass the MaskLoad/Store and the merge function as templates to this general implementation; then we could maybe have 1 general implementation for all the kernels. But I think it might be too much templating. In general, wrapping _mm256_maskstore* also gives a clean way to expose this to the user if we wish to do so. What do you suggest?
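For reference, a toy sketch of the refactoring discussed in this thread: the none / all / lower-half / upper-half / crossing branch structure is shared, and only the architecture-specific operations are injected as callables (the "pass MaskLoad/Store and the merge function as templates" option). Everything below is illustrative; the names and the std::array/std::bitset stand-ins are not xsimd API.

```cpp
#include <array>
#include <bitset>
#include <cstddef>

// FullLoad:   unmasked full-width load            (e.g. a _mm256_loadu_ps wrapper)
// MaskedLoad: arch-specific full masked load      (e.g. a _mm256_maskload_ps wrapper)
// HalfLoad:   masked load on the 128-bit sub-arch (recursion into SSE); it only
//             consumes the low N/2 bits of the mask it receives and returns an
//             indexable half-width result such as std::array<float, N / 2>.
template <std::size_t N, class FullLoad, class MaskedLoad, class HalfLoad>
std::array<float, N> load_masked_generic(float const* mem, std::bitset<N> mask,
                                         FullLoad&& full_load,
                                         MaskedLoad&& masked_load,
                                         HalfLoad&& half_load)
{
    constexpr std::size_t half = N / 2;
    std::array<float, N> out {}; // masked-out lanes stay zero

    if (mask.none())
        return out;
    if (mask.all())
        return full_load(mem);
    if ((mask >> half).none()) // confined to the lower half (countl_zero >= half)
    {
        auto lo = half_load(mem, mask);
        for (std::size_t i = 0; i < half; ++i)
            out[i] = lo[i];
        return out;
    }
    if ((mask << half).none()) // confined to the upper half (countr_zero >= half)
    {
        auto hi = half_load(mem + half, mask >> half);
        for (std::size_t i = 0; i < half; ++i)
            out[half + i] = hi[i];
        return out;
    }
    return masked_load(mem, mask); // mask crosses the 128-bit boundary
}
```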
```cpp
        // lt
        template <class A>
        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
```