Skip to content
Open
75 changes: 33 additions & 42 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,37 @@ on:
permissions: read-all

jobs:
build-windows-msvc:
  # Configures and builds the project with Meson + Ninja on a Windows runner.
  runs-on: windows-latest

  steps:
    # Pinned to a full commit SHA, like the checkout used elsewhere in this
    # workflow, instead of a floating tag.
    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

    # Set up MSVC environment
    - name: Set up MSVC Developer Command Prompt
      uses: ilammy/msvc-dev-cmd@v1
      with:
        arch: x64

    # Install Python (Meson requires it)
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.x'

    # Install Meson and Ninja. `python -m pip` is used for both commands so
    # the packages land in the interpreter selected by the previous step.
    - name: Install Meson + Ninja
      run: |
        python -m pip install --upgrade pip
        python -m pip install meson ninja

    # Configure and build with Meson (MSVC will be used automatically)
    - name: Configure (Meson)
      run: meson setup --warnlevel 2 --buildtype release builddir --backend=ninja

    - name: Build (Ninja)
      run: ninja -C builddir

SKL-gcc9:

runs-on: intel-ubuntu-24.04
Expand Down Expand Up @@ -135,44 +166,7 @@ jobs:
- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe

ADL-ASAN-clang18:
  # Builds the examples and the test suite with clang-18 and
  # -Db_sanitize=address,undefined, then runs the tests under Intel SDE.
  runs-on: intel-ubuntu-24.04

  steps:
    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

    - name: Install dependencies
      run: |
        sudo apt update
        sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git

    # Downloads the SDE tarball and exposes its sde64 binary as `sde`.
    - name: Install Intel SDE
      run: |
        curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
        mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde

    - name: Build examples
      env:
        CXX: clang++-18
      run: |
        cd examples
        make all

    - name: Build
      env:
        CXX: clang++-18
      run: |
        make clean
        meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
        cd builddir
        ninja

    # The step label now matches the `-adl` target passed to sde (and the job
    # name); it previously said "SPR".
    - name: Run test suite on ADL
      run: sde -adl -- ./builddir/testexe

SPR-ASAN-clang18:
ASAN-clang18:

runs-on: intel-ubuntu-24.04

Expand Down Expand Up @@ -207,10 +201,7 @@ jobs:
ninja

- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe
- name: Run ICL fp16 tests
# Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future
run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*"
run: ./builddir/testexe

SKX-SKL-openmp:

Expand Down
12 changes: 6 additions & 6 deletions benchmarks/bench.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
/*
 * Registers `func<T>` as a benchmark named "<func>/<test_case_name>/<T>".
 * The variadic arguments are forwarded to `func<T>` after the benchmark
 * State on every invocation.
 *
 * The macro body must appear exactly once: a second copy of the
 * unique_ptr/FunctionBenchmark argument lines leaves the parentheses of
 * RegisterBenchmarkInternal unbalanced.
 */
#define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \
    BENCHMARK_PRIVATE_DECLARE(func) \
            = (::benchmark::internal::RegisterBenchmarkInternal( \
                    std::unique_ptr<benchmark::internal::Benchmark>( \
                            new ::benchmark::internal::FunctionBenchmark( \
                                    #func "/" #test_case_name "/" #T, \
                                    [](::benchmark::State &st) { \
                                        func<T>(st, __VA_ARGS__); \
                                    }))))

#define BENCH_SORT(func, type) \
MY_BENCHMARK_CAPTURE(func, type, random_128, 128, std::string("random")); \
Expand Down
63 changes: 28 additions & 35 deletions lib/meson.build
Original file line number Diff line number Diff line change
@@ -1,48 +1,41 @@
# Per-ISA static libraries. Each one is compiled with the architecture flag
# for its instruction set: MSVC takes /arch:..., every other compiler takes
# -march=....
#
# Each target name is declared exactly once; Meson rejects a second
# static_library() call that reuses an existing target name.
libtargets = []

# AVX2 kernels
libtargets += static_library('libavx',
  files(
    'x86simdsort-avx2.cpp',
    ),
  include_directories : [src],
  cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'],
  gnu_symbol_visibility : 'inlineshidden',
  dependencies: [omp_dep],
  )

# AVX-512 kernels built for skylake-avx512
libtargets += static_library('libskx',
  files(
    'x86simdsort-skx.cpp',
    ),
  include_directories : [src],
  cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'],
  gnu_symbol_visibility : 'inlineshidden',
  dependencies: [omp_dep],
  )

# AVX-512 kernels built for icelake-client
libtargets += static_library('libicl',
  files(
    'x86simdsort-icl.cpp',
    ),
  include_directories : [src],
  cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'],
  gnu_symbol_visibility : 'inlineshidden',
  dependencies: [omp_dep],
  )

if cancompilefp16
libtargets += static_library('libspr',
files(
'x86simdsort-spr.cpp',
),
include_directories : [src],
cpp_args : ['-march=sapphirerapids'],
cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'],
gnu_symbol_visibility : 'inlineshidden',
dependencies: [omp_dep],
)
Expand Down
4 changes: 4 additions & 0 deletions lib/x86simdsort-icl.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
// ICL specific routines:
#include "x86simdsort-static-incl.h"
#include "x86simdsort-internal.h"
#ifdef _MSC_VER
#include "avx512-16bit-qsort.hpp"
#endif


namespace xss {
namespace avx512 {
Expand Down
34 changes: 18 additions & 16 deletions lib/x86simdsort.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
// XSS_ATTRIBUTE_CONSTRUCTOR tags a function with the GCC/Clang `constructor`
// attribute on non-MSVC compilers.
// NOTE(review): on MSVC the macro expands to nothing, so a function declared
// with it is just an ordinary function there. Nothing in this block arranges
// for such functions to be called automatically; confirm how the tagged
// functions get invoked in MSVC builds.
#if defined(_MSC_VER)
#define XSS_ATTRIBUTE_CONSTRUCTOR
#else
#define XSS_ATTRIBUTE_CONSTRUCTOR __attribute__((constructor))
#endif
#include "x86simdsort.h"
#include "x86simdsort-internal.h"
#include "x86simdsort-scalar.h"
#include "x86simdsortcpuid.h"
#include <algorithm>
#include <iostream>
#include <string>
Expand All @@ -12,23 +18,19 @@ static int check_cpu_feature_support(std::string_view cpufeature)
if ((cpufeature == "avx512_spr") && (!disable_avx512))
#if defined(__FLT16_MAX__) && !defined(__INTEL_LLVM_COMPILER) \
&& (!defined(__clang_major__) || __clang_major__ >= 18)
return __builtin_cpu_supports("avx512f")
&& __builtin_cpu_supports("avx512fp16")
&& __builtin_cpu_supports("avx512vbmi2");
return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512fp16")
&& xss_cpu_supports("avx512vbmi2");
#else
return 0;
#endif
else if ((cpufeature == "avx512_icl") && (!disable_avx512))
return __builtin_cpu_supports("avx512f")
&& __builtin_cpu_supports("avx512vbmi2")
&& __builtin_cpu_supports("avx512bw")
&& __builtin_cpu_supports("avx512vl");
return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512vbmi2")
&& xss_cpu_supports("avx512bw") && xss_cpu_supports("avx512vl");
else if ((cpufeature == "avx512_skx") && (!disable_avx512))
return __builtin_cpu_supports("avx512f")
&& __builtin_cpu_supports("avx512dq")
&& __builtin_cpu_supports("avx512vl");
return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512dq")
&& xss_cpu_supports("avx512vl");
else if (cpufeature == "avx2")
return __builtin_cpu_supports("avx2");
return xss_cpu_supports("avx2");

return 0;
}
Expand Down Expand Up @@ -121,11 +123,11 @@ constexpr bool IS_TYPE_FLOAT16()

/* runtime dispatch mechanism */
#define DISPATCH(func, TYPE, ISA) \
DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \
CAT(CAT(resolve_, func), TYPE)(void) \
DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \
CAT(resolve_, func), TYPE)(void) \
{ \
CAT(CAT(internal_, func), TYPE) = &xss::scalar::func<TYPE>; \
__builtin_cpu_init(); \
xss_cpu_init(); \
std::string_view preferred_cpu = find_preferred_cpu(ISA); \
if constexpr (dispatch_requested("avx512", ISA)) { \
if (preferred_cpu.find("avx512") != std::string_view::npos) { \
Expand Down Expand Up @@ -248,12 +250,12 @@ DISPATCH_ALL(argselect,
}

#define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \
static __attribute__((constructor)) void CAT( \
static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \
CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \
{ \
CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \
= &xss::scalar::func<TYPE1, TYPE2>; \
__builtin_cpu_init(); \
xss_cpu_init(); \
std::string_view preferred_cpu = find_preferred_cpu(ISA); \
if constexpr (dispatch_requested("avx512", ISA)) { \
if (preferred_cpu.find("avx512") != std::string_view::npos) { \
Expand Down
14 changes: 11 additions & 3 deletions lib/x86simdsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,13 @@
#include <functional>
#include <numeric>

// Symbol export/hide annotations.
//   - Non-MSVC compilers: ELF-style visibility attributes.
//   - MSVC: __declspec(dllexport) for exported symbols; XSS_HIDE_SYMBOL
//     expands to nothing.
// NOTE(review): the MSVC branch always uses dllexport, including when this
// header is included by a consumer of the library; confirm that is intended.
#ifndef _MSC_VER
#define XSS_EXPORT_SYMBOL __attribute__((visibility("default")))
#define XSS_HIDE_SYMBOL __attribute__((visibility("hidden")))
#else
#define XSS_EXPORT_SYMBOL __declspec(dllexport)
#define XSS_HIDE_SYMBOL
#endif
#define UNUSED(x) (void)(x)

namespace x86simdsort {
Expand Down Expand Up @@ -73,11 +78,14 @@ XSS_EXPORT_SYMBOL void keyvalue_partial_sort(T1 *key,
template <typename T, typename U, typename Func>
XSS_EXPORT_SYMBOL void object_qsort(T *arr, U arrsize, Func key_func)
{
static_assert(std::is_integral<U>::value, "arrsize must be an integral type");
static_assert(std::is_integral<U>::value,
"arrsize must be an integral type");
static_assert(sizeof(U) == sizeof(int32_t) || sizeof(U) == sizeof(int64_t),
"arrsize must be 32 or 64 bits");
using return_type_of = typename decltype(std::function{key_func})::result_type;
static_assert(sizeof(return_type_of) == sizeof(int32_t) || sizeof(return_type_of) == sizeof(int64_t),
using return_type_of =
typename decltype(std::function {key_func})::result_type;
static_assert(sizeof(return_type_of) == sizeof(int32_t)
|| sizeof(return_type_of) == sizeof(int64_t),
"key_func return type must be 32 or 64 bits");
std::vector<return_type_of> keys(arrsize);
for (U ii = 0; ii < arrsize; ++ii) {
Expand Down
48 changes: 48 additions & 0 deletions lib/x86simdsortcpuid.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef X86SIMDSORT_CPUID_H
#define X86SIMDSORT_CPUID_H

/*
 * Compiler-independent CPU feature detection.
 *
 *   xss_cpu_init()            - gather CPU feature information (call first).
 *   xss_cpu_supports("name")  - query one feature by name, e.g. "avx2",
 *                               "avx512f", "avx512dq", "avx512bw",
 *                               "avx512vl", "avx512vbmi2", "avx512fp16".
 *
 * On GCC/Clang both map directly onto the __builtin_cpu_* builtins. MSVC has
 * no such builtins, so the same interface is implemented with CPUID/XGETBV.
 */
#ifdef _MSC_VER
#include <immintrin.h> // _xgetbv
#include <intrin.h> // __cpuid, __cpuidex
#include <string>
#include <unordered_map>

// Feature name -> "usable on this machine", filled in by xss_cpu_init().
// Declared `inline` (C++17) so the inline functions below refer to one shared
// object rather than a separate internal-linkage copy per translation unit.
inline std::unordered_map<std::string, bool> xss_cpu_features;

/*
 * Detects the supported SIMD extensions and records them in xss_cpu_features.
 *
 * A feature is reported as supported only when BOTH hold:
 *   1. the CPU advertises it through CPUID, and
 *   2. the OS has enabled the matching register state in XCR0. Without this
 *      an AVX/AVX-512 instruction faults even though the CPUID bit is set.
 *
 * Calling this more than once is cheap: detection runs only the first time.
 */
inline void xss_cpu_init()
{
    // CPU features cannot change while the process runs.
    if (!xss_cpu_features.empty()) { return; }

    // Extracts one bit from a CPUID output register. The register is cast to
    // unsigned first so that testing bit 31 never shifts into the sign bit.
    const auto has_bit = [](int reg, unsigned idx) -> bool {
        return ((static_cast<unsigned>(reg) >> idx) & 1u) != 0u;
    };

    int cpuInfo[4] = {0};

    // Leaf 0: EAX holds the highest supported standard leaf.
    __cpuid(cpuInfo, 0);
    const int nIds = cpuInfo[0];

    // Leaf 1: OSXSAVE (ECX bit 27) says XGETBV is usable, AVX is ECX bit 28.
    bool osxsave = false;
    bool avx = false;
    if (nIds >= 1) {
        __cpuid(cpuInfo, 1);
        osxsave = has_bit(cpuInfo[2], 27);
        avx = has_bit(cpuInfo[2], 28);
    }

    // XCR0: bits 1-2 = SSE + AVX (XMM/YMM) state,
    //       bits 5-7 = AVX-512 opmask, ZMM_Hi256 and Hi16_ZMM state.
    bool os_avx = false;
    bool os_avx512 = false;
    if (osxsave && avx) {
        const unsigned long long xcr0 = _xgetbv(0);
        os_avx = (xcr0 & 0x06ull) == 0x06ull;
        os_avx512 = os_avx && ((xcr0 & 0xE0ull) == 0xE0ull);
    }

    // Leaf 7, sub-leaf 0: structured extended feature flags. Only valid when
    // the CPU reports at least 7 standard leaves; otherwise keep all zeros.
    int leaf7[4] = {0, 0, 0, 0};
    if (nIds >= 7) { __cpuidex(leaf7, 7, 0); }
    const int ebx = leaf7[1];
    const int ecx = leaf7[2];
    const int edx = leaf7[3];

    // Every AVX-512 extension additionally requires the AVX512F foundation.
    const bool avx512f = os_avx512 && has_bit(ebx, 16);

    // Store results
    xss_cpu_features["avx2"] = os_avx && has_bit(ebx, 5);
    xss_cpu_features["avx512f"] = avx512f;
    xss_cpu_features["avx512dq"] = avx512f && has_bit(ebx, 17);
    xss_cpu_features["avx512bw"] = avx512f && has_bit(ebx, 30);
    xss_cpu_features["avx512vl"] = avx512f && has_bit(ebx, 31);
    xss_cpu_features["avx512vbmi2"] = avx512f && has_bit(ecx, 6);
    xss_cpu_features["avx512fp16"] = avx512f && has_bit(edx, 23);
}

/*
 * Returns true when `feature` was detected by xss_cpu_init().
 * Unknown names, a null pointer, or a query made before xss_cpu_init() all
 * yield false.
 */
inline bool xss_cpu_supports(const char *feature)
{
    if (feature == nullptr) { return false; }
    const auto it = xss_cpu_features.find(feature);
    return it != xss_cpu_features.end() && it->second;
}

#else
#define xss_cpu_init() __builtin_cpu_init()
#define xss_cpu_supports(feature) __builtin_cpu_supports(feature)
#endif // _MSC_VER
#endif // X86SIMDSORT_CPUID_H
9 changes: 9 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

project('x86-simd-sort', 'cpp',
version : '7.0.x',
license : 'BSD 3-clause',
Expand All @@ -10,6 +11,13 @@ bench = include_directories('benchmarks')
utils = include_directories('utils')
tests = include_directories('tests')

# Fail at configure time when a non-MSVC compiler rejects any of the required
# -march flags, and name exactly which flag(s) are unsupported so the user
# does not have to guess.
if cpp.get_id() != 'msvc'
  unsupported_march_flags = []
  foreach march_flag : ['-march=haswell', '-march=skylake-avx512', '-march=icelake-client']
    if not cpp.has_argument(march_flag)
      unsupported_march_flags += [march_flag]
    endif
  endforeach
  if unsupported_march_flags.length() > 0
    error('Compiler does not support ' + ', '.join(unsupported_march_flags) + '. Please use a newer compiler version.')
  endif
endif

# Add IPP sort to benchmarks:
benchipp = false
ipplink = []
Expand Down Expand Up @@ -37,6 +45,7 @@ if get_option('use_openmp')
omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP'])
endif


fp16code = '''#include<immintrin.h>
int main() {
__m512h temp = _mm512_set1_ph(1.0f);
Expand Down
Loading