4 changes: 2 additions & 2 deletions docs/source/api/data_transfer.rst
@@ -10,7 +10,7 @@ Data transfer
From memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load` | load values from memory |
| :cpp:func:`load` | load values from memory (optionally masked) |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_aligned` | load values from aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -30,7 +30,7 @@ From a scalar:
To memory:

+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store` | store values to memory |
| :cpp:func:`store` | store values to memory (optionally masked) |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_aligned` | store values to aligned memory |
+---------------------------------------+----------------------------------------------------+
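For context, a minimal sketch of the lane-wise behaviour the new "(optionally masked)" wording refers to. This is illustrative plain C++ mirroring the scalar fallback added in xsimd_common_memory.hpp below; the function names are hypothetical and not part of the xsimd API.

#include <cstddef>

// Masked load: active lanes read from memory, inactive lanes are zero-filled.
template <class T, std::size_t N>
void masked_load_semantics(T const* mem, const bool (&mask)[N], T (&dst)[N])
{
    for (std::size_t i = 0; i < N; ++i)
        dst[i] = mask[i] ? mem[i] : T(0);
}

// Masked store: active lanes are written, inactive lanes leave memory untouched.
template <class T, std::size_t N>
void masked_store_semantics(T* mem, const bool (&mask)[N], T const (&src)[N])
{
    for (std::size_t i = 0; i < N; ++i)
        if (mask[i])
            mem[i] = src[i];
}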
1 change: 1 addition & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -16,6 +16,7 @@
#include <limits>
#include <type_traits>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_common_details.hpp"

namespace xsimd
97 changes: 97 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -13,6 +13,7 @@
#define XSIMD_COMMON_MEMORY_HPP

#include <algorithm>
#include <array>
#include <complex>
#include <stdexcept>

@@ -341,6 +342,102 @@ namespace xsimd
return detail::load_unaligned<A>(mem, cvt, common {}, detail::conversion_type<A, T_in, T_out> {});
}

template <class A, class T>
XSIMD_INLINE batch<T, A> load(T const* mem, aligned_mode, requires_arch<A>) noexcept
{
return load_aligned<A>(mem, convert<T> {}, A {});
}

template <class A, class T>
XSIMD_INLINE batch<T, A> load(T const* mem, unaligned_mode, requires_arch<A>) noexcept
{
return load_unaligned<A>(mem, convert<T> {}, A {});
}

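// Common fallback: materialise the masked load lane by lane through a
// zero-initialised, aligned temporary buffer, then load the buffer as a full
// batch. The store_masked fallback below writes only the active lanes.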
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A>
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T_out, A>::size;
alignas(A::alignment()) std::array<T_out, size> buffer {};
constexpr bool mask[size] = { Values... };

for (std::size_t i = 0; i < size; ++i)
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);

return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T_in, A>::size;
constexpr bool mask[size] = { Values... };

for (std::size_t i = 0; i < size; ++i)
if (mask[i])
{
mem[i] = static_cast<T_out>(src.get(i));
}
}

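// Same-width integer overloads reuse the floating-point masked implementations
// through bitwise casts, so an architecture only has to provide float/double
// kernels (e.g. the AVX maskload/maskstore wrappers further down).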
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<int32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<uint32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<types::has_simd_register<double, A>::value, batch<int64_t, A>>::type
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<int64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<types::has_simd_register<double, A>::value, batch<uint64_t, A>>::type
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<uint64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<types::has_simd_register<double, A>::value, void>::type
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<types::has_simd_register<double, A>::value, void>::type
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
}

// rotate_right
template <size_t N, class A, class T>
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
127 changes: 127 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -18,6 +18,7 @@
#include <type_traits>

#include "../types/xsimd_avx_register.hpp"
#include "../types/xsimd_batch_constant.hpp"

namespace xsimd
{
@@ -871,6 +872,132 @@ namespace xsimd
return _mm256_loadu_pd(mem);
}

// AVX helpers to avoid type-based branching in the generic load_masked
namespace detail
{
template <class A>
XSIMD_INLINE batch<float, A> maskload(float const* mem, batch<as_integer_t<float>, A> const& mask) noexcept
{
return _mm256_maskload_ps(mem, mask);
}

template <class A>
XSIMD_INLINE batch<double, A> maskload(double const* mem, batch<as_integer_t<double>, A> const& mask) noexcept
{
return _mm256_maskload_pd(mem, mask);
}

template <class A>
XSIMD_INLINE batch<float, A> zero_extend(batch<float, A> const& hi) noexcept
{
return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
}

template <class A>
XSIMD_INLINE batch<double, A> zero_extend(batch<double, A> const& hi) noexcept
{
return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
}

// allow inserting a 128-bit SSE batch into the upper half of an AVX batch
template <class A, class SrcA>
XSIMD_INLINE batch<float, A> zero_extend(batch<float, SrcA> const& hi) noexcept
{
return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1);
}

template <class A, class SrcA>
XSIMD_INLINE batch<double, A> zero_extend(batch<double, SrcA> const& hi) noexcept
{
return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1);
}
}

// load_masked (single overload for float/double)
template <class A, class T, bool... Values, class Mode, class = typename std::enable_if<std::is_floating_point<T>::value>::type>
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx>) noexcept
{
using int_t = as_integer_t<T>;
constexpr size_t half_size = batch<T, A>::size / 2;

XSIMD_IF_CONSTEXPR(mask.none())
{
return batch<T, A>(T { 0 });
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
return load<A>(mem, Mode {});
}
// confined to lower 128-bit half → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
}
// confined to upper 128-bit half → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
return detail::zero_extend<A>(hi);
}
else
{
// crossing 128-bit boundary → use 256-bit masked load
return detail::maskload<A>(mem, mask.as_batch());
}
}
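
For orientation, a few hypothetical mask shapes for an 8-lane AVX float batch and the branch each one selects in the dispatch above (illustrative only, not part of the change):

// Illustrative mask shapes; the alias names are placeholders.
// all lanes off            -> zero batch, no memory access
using none_mask  = xsimd::batch_bool_constant<float, xsimd::avx, false, false, false, false, false, false, false, false>;
// all lanes on             -> plain load<A>(mem, Mode {})
using full_mask  = xsimd::batch_bool_constant<float, xsimd::avx, true, true, true, true, true, true, true, true>;
// lower 128-bit half only  -> SSE4.2 masked load, result zero-extended
using lower_mask = xsimd::batch_bool_constant<float, xsimd::avx, true, true, true, false, false, false, false, false>;
// upper 128-bit half only  -> SSE4.2 masked load at mem + 4, placed in the upper half
using upper_mask = xsimd::batch_bool_constant<float, xsimd::avx, false, false, false, false, true, false, true, true>;
// crosses the boundary     -> full-width _mm256_maskload_ps
using cross_mask = xsimd::batch_bool_constant<float, xsimd::avx, false, false, true, true, true, true, false, false>;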

// store_masked
namespace detail
{
template <class A>
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
{
_mm256_maskstore_ps(mem, mask, src);
}

template <class A>
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
{
_mm256_maskstore_pd(mem, mask, src);
}
}

template <class A, class T, bool... Values, class Mode, class = typename std::enable_if<std::is_floating_point<T>::value>::type>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
constexpr size_t half_size = batch<T, A>::size / 2;

XSIMD_IF_CONSTEXPR(mask.none())
{
return;
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
src.store(mem, Mode {});
}
// confined to lower 128-bit half → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
}
else
{
detail::maskstore(mem, mask.as_batch(), src);
}
}
Contributor:

epiphany: if you express everything in terms of lower / upper halves, you're pretty close to a type-generic implementation that would avoid a lot of redundancy. You'd just need to wrap the actual _mm256_maskstore* intrinsics and we're good, right?

Contributor Author:

I guess you are right. It might be possible to express things in terms of sizes and halves. This might be recursive though, and we might need to map an arch to its half arch, similarly to what happens in widen.

Contributor Author:

I'll also have to think about whether the size of the element matters. I'll have a look next week.

Contributor Author:

The reason why I kept the implementations separate is as follows:

A general implementation in terms of upper/lower halves is possible. It halves the code in avx, but avx2 still has to provide its own implementation so that it can take advantage of the new intrinsics, so we go from 2 versions down to 1 in avx and from 2 down to 1 in avx2. Same for avx512.

But then, for merging, we cannot use more optimized intrinsics than _mm256_castps128_ps256. Is it worth it?

The solution here would be to pass the MaskLoad/Store and the merge function as templates to this general implementation. Then we could have 1 general implementation, maybe for all the kernels. But I think it might be too much templating.

In general, wrapping _mm256_maskstore* also gives a clean way to expose this to the user if we wish to do so.

What do you suggest?
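
For reference, a rough sketch of the direction discussed above: a single half-recursive store_masked that reuses the lower/upper-half helpers and receives the arch's native masked store through one wrapper. half_arch_t and detail::wrapped_maskstore are hypothetical placeholders, not existing xsimd facilities; this illustrates the idea, not the PR's implementation.

// Hypothetical sketch: one generic kernel parameterised on a half-width arch
// and a wrapped native masked store, instead of one copy per architecture.
template <class A, class T, bool... Values, class Mode>
XSIMD_INLINE void store_masked_generic(T* mem, batch<T, A> const& src,
                                       batch_bool_constant<T, A, Values...> mask,
                                       Mode, requires_arch<A>) noexcept
{
    using half_arch = half_arch_t<A>; // assumed trait, e.g. avx -> sse4_2
    constexpr std::size_t half_size = batch<T, A>::size / 2;

    XSIMD_IF_CONSTEXPR(mask.none())
    {
        return;
    }
    else XSIMD_IF_CONSTEXPR(mask.all())
    {
        src.store(mem, Mode {});
    }
    else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
    {
        // active lanes confined to the lower half: recurse on the half arch
        store_masked(mem, detail::lower_half(src),
                     ::xsimd::detail::lower_half<half_arch>(mask), Mode {}, half_arch {});
    }
    else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
    {
        // active lanes confined to the upper half: recurse on the half arch
        store_masked(mem + half_size, detail::upper_half(src),
                     ::xsimd::detail::upper_half<half_arch>(mask), Mode {}, half_arch {});
    }
    else
    {
        // crossing the boundary: single call to the arch's wrapped intrinsic
        // (_mm256_maskstore_ps/pd on avx, _mm256_maskstore_epi32/epi64 on avx2, ...)
        detail::wrapped_maskstore(mem, mask.as_batch(), src);
    }
}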


// lt
template <class A>
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept