From 40d147a008eb97fcd8472268dd7b54387033bea1 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 3 Nov 2025 20:21:04 -0800 Subject: [PATCH 01/28] Basic framework for WaveActive ops. --- .../unittests/HLSLExec/LongVectorTestData.h | 5 ++- .../clang/unittests/HLSLExec/LongVectors.cpp | 41 +++++++++++++++++++ .../unittests/HLSLExec/ShaderOpArith.xml | 1 - 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 81b9d2cfef..9c9cd58137 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -349,8 +349,9 @@ INPUT_SET(InputSet::SelectCond, 0, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) -INPUT_SET(InputSet::Default1, -1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, - 1.0, -0.01); +INPUT_SET(InputSet::Default1, -1.0, 1.0); +// INPUT_SET(InputSet::Default1, -1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, +// 1.0, -0.01); INPUT_SET(InputSet::Default2, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); INPUT_SET(InputSet::Default3, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 5a9f5d6f70..5ccd441e35 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1379,6 +1379,47 @@ void dispatchTest(ID3D12Device *D3DDevice, bool VerboseLogging, } } +static bool isWarpDevice(ID3D12Device *D3DDevice) { + DXASSERT_NOMSG(D3DDevice != nullptr); + + // Get the adapter LUID from the device + LUID AdapterLuid = D3DDevice->GetAdapterLuid(); + + // Create a DXGI factory to enumerate adapters + CComPtr DXGIFactory; + HRESULT HR = CreateDXGIFactory1(IID_PPV_ARGS(&DXGIFactory)); + if (FAILED(HR)) { + hlsl_test::LogCommentFmt( + L"isWarpDevice: Failed to create DXGI factory, HR=0x%08x", HR); + return false; + } + + // Get the adapter by LUID + CComPtr DXGIAdapter; + HR = DXGIFactory->EnumAdapterByLuid(AdapterLuid, IID_PPV_ARGS(&DXGIAdapter)); + if (FAILED(HR) || !DXGIAdapter) { + hlsl_test::LogCommentFmt( + L"isWarpDevice: Failed to enumerate adapter by LUID, HR=0x%08x", HR); + return false; + } + + DXGI_ADAPTER_DESC1 Desc{}; + HR = DXGIAdapter->GetDesc1(&Desc); + if (FAILED(HR)) { + hlsl_test::LogCommentFmt( + L"isWarpDevice: Failed to get adapter description, HR=0x%08x", HR); + return false; + } + + // Check for WARP adapter (VendorId 0x1414, DeviceId 0x8c) + const bool IsWarp = (Desc.VendorId == 0x1414 && Desc.DeviceId == 0x8c); + hlsl_test::LogCommentFmt( + L"isWarpDevice: VendorId=0x%04x, DeviceId=0x%04x, IsWarp=%d", + Desc.VendorId, Desc.DeviceId, IsWarp); + + return IsWarp; +} + template void dispatchWaveOpTest(ID3D12Device *D3DDevice, bool VerboseLogging, size_t OverrideInputSize, UINT WaveSize) { diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index b48e94bf3d..99e1b14fa3 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4141,7 +4141,6 @@ void MSMain(uint GID : SV_GroupIndex, #else const uint32_t OutNum = NUM; #endif - #if IS_UNARY_OP vector OutputVector = FUNC(Input1); #elif IS_BINARY_OP From a0fb36d2a49a614241817b535e7793eb83358120 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:19:23 -0800 Subject: [PATCH 02/28] Cleanup. Switch to default validation so we get 1 ULP of tolerance for floating point ops for now. --- tools/clang/unittests/HLSLExec/LongVectorTestData.h | 5 ++--- tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 9c9cd58137..81b9d2cfef 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -349,9 +349,8 @@ INPUT_SET(InputSet::SelectCond, 0, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) -INPUT_SET(InputSet::Default1, -1.0, 1.0); -// INPUT_SET(InputSet::Default1, -1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, -// 1.0, -0.01); +INPUT_SET(InputSet::Default1, -1.0, -1.0, 1.0, -0.01, 1.0, -0.01, 1.0, -0.01, + 1.0, -0.01); INPUT_SET(InputSet::Default2, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0); INPUT_SET(InputSet::Default3, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 99e1b14fa3..b48e94bf3d 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4141,6 +4141,7 @@ void MSMain(uint GID : SV_GroupIndex, #else const uint32_t OutNum = NUM; #endif + #if IS_UNARY_OP vector OutputVector = FUNC(Input1); #elif IS_BINARY_OP From 344cafd370b086e165668c0f05983309450aeb1c Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 5 Nov 2025 20:36:56 -0800 Subject: [PATCH 03/28] check device in test method setup. default to min wave size instead --- .../clang/unittests/HLSLExec/LongVectors.cpp | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 5ccd441e35..5a9f5d6f70 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1379,47 +1379,6 @@ void dispatchTest(ID3D12Device *D3DDevice, bool VerboseLogging, } } -static bool isWarpDevice(ID3D12Device *D3DDevice) { - DXASSERT_NOMSG(D3DDevice != nullptr); - - // Get the adapter LUID from the device - LUID AdapterLuid = D3DDevice->GetAdapterLuid(); - - // Create a DXGI factory to enumerate adapters - CComPtr DXGIFactory; - HRESULT HR = CreateDXGIFactory1(IID_PPV_ARGS(&DXGIFactory)); - if (FAILED(HR)) { - hlsl_test::LogCommentFmt( - L"isWarpDevice: Failed to create DXGI factory, HR=0x%08x", HR); - return false; - } - - // Get the adapter by LUID - CComPtr DXGIAdapter; - HR = DXGIFactory->EnumAdapterByLuid(AdapterLuid, IID_PPV_ARGS(&DXGIAdapter)); - if (FAILED(HR) || !DXGIAdapter) { - hlsl_test::LogCommentFmt( - L"isWarpDevice: Failed to enumerate adapter by LUID, HR=0x%08x", HR); - return false; - } - - DXGI_ADAPTER_DESC1 Desc{}; - HR = DXGIAdapter->GetDesc1(&Desc); - if (FAILED(HR)) { - hlsl_test::LogCommentFmt( - L"isWarpDevice: Failed to get adapter description, HR=0x%08x", HR); - return false; - } - - // Check for WARP adapter (VendorId 0x1414, DeviceId 0x8c) - const bool IsWarp = (Desc.VendorId == 0x1414 && Desc.DeviceId == 0x8c); - hlsl_test::LogCommentFmt( - L"isWarpDevice: VendorId=0x%04x, DeviceId=0x%04x, IsWarp=%d", - Desc.VendorId, Desc.DeviceId, IsWarp); - - return IsWarp; -} - template void dispatchWaveOpTest(ID3D12Device *D3DDevice, bool VerboseLogging, size_t OverrideInputSize, UINT WaveSize) { From 46011c30afd61a6e66990807126e812f0a35abce Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 5 Nov 2025 21:27:56 -0800 Subject: [PATCH 04/28] WIP on some other WaveActive ops --- .../unittests/HLSLExec/LongVectorOps.def | 5 ++ .../unittests/HLSLExec/LongVectorTestData.h | 9 +++ .../clang/unittests/HLSLExec/LongVectors.cpp | 70 +++++++++++++++++-- .../unittests/HLSLExec/ShaderOpArith.xml | 36 ++++++++++ 4 files changed, 115 insertions(+), 5 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 2b4f4f2dca..2e5bffd9c0 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -19,6 +19,7 @@ INPUT_SET(Positive) INPUT_SET(Bitwise) INPUT_SET(SelectCond) INPUT_SET(FloatSpecial) +INPUT_SET(AllOnes) #undef INPUT_SET @@ -194,5 +195,9 @@ OP_LOAD_AND_STORE_SB(LoadAndStore_RD_SB_SRV, "RootDescriptor_SRV") #undef OP_LOAD_AND_STORE_DEFINES OP_DEFAULT(Wave, WaveActiveSum, 1, "WaveActiveSum", "") +OP_DEFAULT_DEFINES(Wave, WaveActiveMin, 1, "TestWaveActiveMin", "", " -DFUNC_WAVE_ACTIVE_MIN=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveMax, 1, "TestWaveActiveMax", "", " -DFUNC_WAVE_ACTIVE_MAX=1") +OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE_PRODUCT=1", "LongVectorOp", + AllOnes, Default2, Default3) #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 81b9d2cfef..519f8a8b63 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -289,6 +289,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555, static_cast(0xAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(int32_t) @@ -302,6 +303,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x55555555, static_cast(0xAAAAAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(int64_t) @@ -315,6 +317,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555555555555555LL, 0xAAAAAAAAAAAAAAAALL, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint16_t) @@ -325,6 +328,7 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 12, 13, 14, 15); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555, 0xAAAA, 0x8000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint32_t) @@ -335,6 +339,7 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 30, 31, 32); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x55555555, 0xAAAAAAAA, 0x80000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint64_t) @@ -346,6 +351,7 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 0x8000000000000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); +INPUT_SET(InputSet::AllOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) @@ -376,6 +382,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0, 1.0 / 3.0); +INPUT_SET(InputSet::AllOnes, 1.0); END_INPUT_SETS() BEGIN_INPUT_SETS(float) @@ -403,6 +410,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0f, 1.0f / 3.0f); +INPUT_SET(InputSet::AllOnes, 1.0f); END_INPUT_SETS() BEGIN_INPUT_SETS(double) @@ -421,6 +429,7 @@ INPUT_SET(InputSet::SplitDouble, 0.0, -1.0, 1.0, -1.0, 12345678.87654321, -1.0, INPUT_SET(InputSet::Positive, 1.0, 1.0, 65535.0, 0.01, 5531.0, 0.01, 1.0, 0.01, 331.2330, 3250.01); INPUT_SET(InputSet::SelectCond, 0.0, 1.0); +INPUT_SET(InputSet::AllOnes, 1.0); END_INPUT_SETS() #undef BEGIN_INPUT_SETS diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 5a9f5d6f70..3096eee2e0 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1266,10 +1266,44 @@ FLOAT_SPECIAL_OP(OpType::IsNan, (std::isnan(A))); #define WAVE_ACTIVE_OP(OP, IMPL) \ template struct Op : DefaultValidation { \ - T operator()(T A, T WaveSize) { return IMPL; } \ + T operator()(T A, UINT WaveSize) { return IMPL; } \ }; -WAVE_ACTIVE_OP(OpType::WaveActiveSum, (A * WaveSize)); +template T WaveActiveSumFn(T A, UINT WaveSize) { + T WaveSizeT = static_cast(WaveSize); + return A * WaveSizeT; +} + +WAVE_ACTIVE_OP(OpType::WaveActiveSum, (WaveActiveSumFn(A, WaveSize))); + +template T WaveActiveMinFn(T A, UINT WaveSize) { + std::vector Values; + // Add the 'WaveLaneID' to A. + for (UINT I = 0; I < WaveSize; ++I) + Values.push_back(A + static_cast(I)); + return *std::min_element(Values.begin(), Values.end()); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveMin, (WaveActiveMinFn(A, WaveSize))); + +template T WaveActiveMaxFn(T A, UINT WaveSize) { + std::vector Values; + // Add the 'WaveLaneID' to A. + for (UINT I = 0; I < WaveSize; ++I) + Values.push_back(A + static_cast(I)); + return *std::max_element(Values.begin(), Values.end()); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveMax, (WaveActiveMaxFn(A, WaveSize))); + +template T WaveActiveProductFn(T A, UINT WaveSize) { + // We want to avoid overflow of a large product. So, the WaveActiveProdFn has + // an input set of all 1's and we modify the value of the largest lane to be + // equal to the lane index in the shader. + return A * static_cast(WaveSize - 1); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (WaveActiveProductFn(A, WaveSize))); #undef WAVE_ACTIVE_OP @@ -1321,13 +1355,12 @@ template struct WaveOpExpectedBuilder { static auto buildExpected(Op Op, const InputSets &Inputs, UINT WaveSize) { DXASSERT_NOMSG(Inputs.size() == 1); - const T WaveSizeT = static_cast(WaveSize); - std::vector Expected; + std::vector Expected; Expected.reserve(Inputs[0].size()); for (size_t I = 0; I < Inputs[0].size(); ++I) - Expected.push_back(Op(Inputs[0][I], WaveSizeT)); + Expected.push_back(Op(Inputs[0][I], WaveSize)); return Expected; } @@ -2166,16 +2199,43 @@ class DxilConf_SM69_Vectorized { HLK_TEST(LoadAndStore_RD_SB_UAV, double); HLK_WAVEOP_TEST(WaveActiveSum, int16_t); + HLK_WAVEOP_TEST(WaveActiveMin, int16_t); + HLK_WAVEOP_TEST(WaveActiveMax, int16_t); + HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); + HLK_WAVEOP_TEST(WaveActiveMin, int32_t); + HLK_WAVEOP_TEST(WaveActiveMax, int32_t); + HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); + HLK_WAVEOP_TEST(WaveActiveMin, int64_t); + HLK_WAVEOP_TEST(WaveActiveMax, int64_t); + HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); + HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); + HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); + HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); + HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); + HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); + HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); + HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); + HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); + HLK_WAVEOP_TEST(WaveActiveProduct, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveMax, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveProduct, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); + HLK_WAVEOP_TEST(WaveActiveMin, float); + HLK_WAVEOP_TEST(WaveActiveMax, float); + HLK_WAVEOP_TEST(WaveActiveProduct, float); HLK_WAVEOP_TEST(WaveActiveSum, double); + HLK_WAVEOP_TEST(WaveActiveMin, double); + HLK_WAVEOP_TEST(WaveActiveMax, double); + HLK_WAVEOP_TEST(WaveActiveProduct, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index b48e94bf3d..48815aefa7 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4083,6 +4083,34 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_ACTIVE_MIN + vector TestWaveActiveMin(vector Vector) + { + Vector += WaveGetLaneIndex(); + return WaveActiveMin(Vector); + } + #endif + + #ifdef FUNC_WAVE_ACTIVE_MAX + vector TestWaveActiveMax(vector Vector) + { + Vector += WaveGetLaneIndex(); + return WaveActiveMax(Vector); + } + #endif + + #ifdef FUNC_WAVE_ACTIVE_PRODUCT + vector TestWaveActiveProduct(vector Vector) + { + uint LaneIndex = WaveGetLaneIndex(); + if(LaneIndex == (WaveGetLaneCount() - 1)) + { + Vector = LaneIndex; + } + return WaveActiveProduct(Vector); + } + #endif + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, @@ -4142,7 +4170,15 @@ void MSMain(uint GID : SV_GroupIndex, const uint32_t OutNum = NUM; #endif + // Modify input for wave ops + // Below is only use for Max/Min + Input1 += WaveGetLaneIndex(); + + uint LaneIndex = WaveGetLaneIndex(); + uint WaveSize = WaveGetLaneCount(); + #if IS_UNARY_OP + // TODO: Factor this out. Just being quick for implementation. vector OutputVector = FUNC(Input1); #elif IS_BINARY_OP vector OutputVector = FUNC(Input1 OPERATOR From 0d866c820a8cb71a5fcb12216d6928a290908e64 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 6 Nov 2025 11:03:12 -0800 Subject: [PATCH 05/28] Fix XML --- tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 48815aefa7..ded60f1192 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4170,13 +4170,6 @@ void MSMain(uint GID : SV_GroupIndex, const uint32_t OutNum = NUM; #endif - // Modify input for wave ops - // Below is only use for Max/Min - Input1 += WaveGetLaneIndex(); - - uint LaneIndex = WaveGetLaneIndex(); - uint WaveSize = WaveGetLaneCount(); - #if IS_UNARY_OP // TODO: Factor this out. Just being quick for implementation. vector OutputVector = FUNC(Input1); From b1bbfb959abc8cd29b7653463d34aee6c952c64a Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 6 Nov 2025 13:37:46 -0800 Subject: [PATCH 06/28] WIP --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 14 ++++++++++++++ tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 12 ++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 3096eee2e0..b5b67ca359 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1305,6 +1305,14 @@ template T WaveActiveProductFn(T A, UINT WaveSize) { WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (WaveActiveProductFn(A, WaveSize))); +template T WaveActiveBitAndFn(T A, UINT WaveSize) { + std::vector Values; + for (UINT I = 0; I < WaveSize; ++I) + Values.push_back(A & static_cast(~0 << I)); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveBitAnd, (WaveActiveBitAndFn(A, WaveSize))); + #undef WAVE_ACTIVE_OP // @@ -2201,28 +2209,34 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveSum, int16_t); HLK_WAVEOP_TEST(WaveActiveMin, int16_t); HLK_WAVEOP_TEST(WaveActiveMax, int16_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, int16_t); HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint64_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index ded60f1192..7e8e9683ae 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4111,6 +4111,18 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_ACTIVE_BIT_AND + vector TestWaveActiveBitAnd(vector Vector) + { + uint LaneIndex = WaveGetLaneIndex(); + if(LaneIndex == (WaveGetLaneCount() - 1)) + { + Vector = LaneIndex; + } + return WaveActiveBitAnd(Vector); + } + #endif + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From 9d06bdd8a6fc6e42298359b77f1597fd973344c7 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 6 Nov 2025 13:56:27 -0800 Subject: [PATCH 07/28] Remove WaveActiveBitAnd --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 14 -------------- tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 12 ------------ 2 files changed, 26 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index b5b67ca359..3096eee2e0 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1305,14 +1305,6 @@ template T WaveActiveProductFn(T A, UINT WaveSize) { WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (WaveActiveProductFn(A, WaveSize))); -template T WaveActiveBitAndFn(T A, UINT WaveSize) { - std::vector Values; - for (UINT I = 0; I < WaveSize; ++I) - Values.push_back(A & static_cast(~0 << I)); -} - -WAVE_ACTIVE_OP(OpType::WaveActiveBitAnd, (WaveActiveBitAndFn(A, WaveSize))); - #undef WAVE_ACTIVE_OP // @@ -2209,34 +2201,28 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveSum, int16_t); HLK_WAVEOP_TEST(WaveActiveMin, int16_t); HLK_WAVEOP_TEST(WaveActiveMax, int16_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, int16_t); HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint64_t); - HLK_WAVEOP_TEST(WaveActiveBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 7e8e9683ae..ded60f1192 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4111,18 +4111,6 @@ void MSMain(uint GID : SV_GroupIndex, } #endif - #ifdef FUNC_WAVE_ACTIVE_BIT_AND - vector TestWaveActiveBitAnd(vector Vector) - { - uint LaneIndex = WaveGetLaneIndex(); - if(LaneIndex == (WaveGetLaneCount() - 1)) - { - Vector = LaneIndex; - } - return WaveActiveBitAnd(Vector); - } - #endif - #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From 93f43c174d4d3d5ee0c53744086152d1352ae80e Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:59:57 -0800 Subject: [PATCH 08/28] Naming --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 3096eee2e0..04f653459b 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1269,14 +1269,14 @@ FLOAT_SPECIAL_OP(OpType::IsNan, (std::isnan(A))); T operator()(T A, UINT WaveSize) { return IMPL; } \ }; -template T WaveActiveSumFn(T A, UINT WaveSize) { +template T waveActiveSum(T A, UINT WaveSize) { T WaveSizeT = static_cast(WaveSize); return A * WaveSizeT; } -WAVE_ACTIVE_OP(OpType::WaveActiveSum, (WaveActiveSumFn(A, WaveSize))); +WAVE_ACTIVE_OP(OpType::WaveActiveSum, (waveActiveSum(A, WaveSize))); -template T WaveActiveMinFn(T A, UINT WaveSize) { +template T waveActiveMin(T A, UINT WaveSize) { std::vector Values; // Add the 'WaveLaneID' to A. for (UINT I = 0; I < WaveSize; ++I) @@ -1284,9 +1284,9 @@ template T WaveActiveMinFn(T A, UINT WaveSize) { return *std::min_element(Values.begin(), Values.end()); } -WAVE_ACTIVE_OP(OpType::WaveActiveMin, (WaveActiveMinFn(A, WaveSize))); +WAVE_ACTIVE_OP(OpType::WaveActiveMin, (waveActiveMin(A, WaveSize))); -template T WaveActiveMaxFn(T A, UINT WaveSize) { +template T waveActiveMax(T A, UINT WaveSize) { std::vector Values; // Add the 'WaveLaneID' to A. for (UINT I = 0; I < WaveSize; ++I) @@ -1294,16 +1294,16 @@ template T WaveActiveMaxFn(T A, UINT WaveSize) { return *std::max_element(Values.begin(), Values.end()); } -WAVE_ACTIVE_OP(OpType::WaveActiveMax, (WaveActiveMaxFn(A, WaveSize))); +WAVE_ACTIVE_OP(OpType::WaveActiveMax, (waveActiveMax(A, WaveSize))); -template T WaveActiveProductFn(T A, UINT WaveSize) { +template T waveActiveProduct(T A, UINT WaveSize) { // We want to avoid overflow of a large product. So, the WaveActiveProdFn has // an input set of all 1's and we modify the value of the largest lane to be // equal to the lane index in the shader. return A * static_cast(WaveSize - 1); } -WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (WaveActiveProductFn(A, WaveSize))); +WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (waveActiveProduct(A, WaveSize))); #undef WAVE_ACTIVE_OP From 1148fdd1e1f30f21dfb2a3388b66c1b92814110e Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:27:13 -0800 Subject: [PATCH 09/28] Remove todo in xml --- tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index ded60f1192..b2031d3583 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4171,7 +4171,6 @@ void MSMain(uint GID : SV_GroupIndex, #endif #if IS_UNARY_OP - // TODO: Factor this out. Just being quick for implementation. vector OutputVector = FUNC(Input1); #elif IS_BINARY_OP vector OutputVector = FUNC(Input1 OPERATOR From 0e19c0379323c97eb60ca06891edf1d7ec9fb49d Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Fri, 7 Nov 2025 16:14:29 -0800 Subject: [PATCH 10/28] WaveActiveBit Ops --- .../unittests/HLSLExec/LongVectorOps.def | 8 +++-- .../unittests/HLSLExec/LongVectorTestData.h | 18 +++++----- .../clang/unittests/HLSLExec/LongVectors.cpp | 31 ++++++++++++++++ .../unittests/HLSLExec/ShaderOpArith.xml | 35 +++++++++++++++++++ 4 files changed, 80 insertions(+), 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 2e5bffd9c0..f49c04663e 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -19,7 +19,7 @@ INPUT_SET(Positive) INPUT_SET(Bitwise) INPUT_SET(SelectCond) INPUT_SET(FloatSpecial) -INPUT_SET(AllOnes) +INPUT_SET(AllScalarOnes) #undef INPUT_SET @@ -197,7 +197,9 @@ OP_LOAD_AND_STORE_SB(LoadAndStore_RD_SB_SRV, "RootDescriptor_SRV") OP_DEFAULT(Wave, WaveActiveSum, 1, "WaveActiveSum", "") OP_DEFAULT_DEFINES(Wave, WaveActiveMin, 1, "TestWaveActiveMin", "", " -DFUNC_WAVE_ACTIVE_MIN=1") OP_DEFAULT_DEFINES(Wave, WaveActiveMax, 1, "TestWaveActiveMax", "", " -DFUNC_WAVE_ACTIVE_MAX=1") -OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE_PRODUCT=1", "LongVectorOp", - AllOnes, Default2, Default3) +OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE_PRODUCT=1", "LongVectorOp", AllScalarOnes, Default2, Default3) +OP_DEFAULT_DEFINES(Wave, WaveActiveBitAnd, 1, "TestWaveActiveBitAnd", "", " -DFUNC_WAVE_ACTIVE_BIT_AND=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveBitOr, 1, "TestWaveActiveBitOr", "", " -DFUNC_WAVE_ACTIVE_BIT_OR=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFUNC_WAVE_ACTIVE_BIT_XOR=1") #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 519f8a8b63..35144173ba 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -289,7 +289,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555, static_cast(0xAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(int32_t) @@ -303,7 +303,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x55555555, static_cast(0xAAAAAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(int64_t) @@ -317,7 +317,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555555555555555LL, 0xAAAAAAAAAAAAAAAALL, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint16_t) @@ -328,7 +328,7 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 12, 13, 14, 15); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555, 0xAAAA, 0x8000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint32_t) @@ -339,7 +339,7 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 30, 31, 32); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x55555555, 0xAAAAAAAA, 0x80000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(uint64_t) @@ -351,7 +351,7 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 0x8000000000000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::AllScalarOnes, 1); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) @@ -382,7 +382,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0, 1.0 / 3.0); -INPUT_SET(InputSet::AllOnes, 1.0); +INPUT_SET(InputSet::AllScalarOnes, 1.0); END_INPUT_SETS() BEGIN_INPUT_SETS(float) @@ -410,7 +410,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0f, 1.0f / 3.0f); -INPUT_SET(InputSet::AllOnes, 1.0f); +INPUT_SET(InputSet::AllScalarOnes, 1.0f); END_INPUT_SETS() BEGIN_INPUT_SETS(double) @@ -429,7 +429,7 @@ INPUT_SET(InputSet::SplitDouble, 0.0, -1.0, 1.0, -1.0, 12345678.87654321, -1.0, INPUT_SET(InputSet::Positive, 1.0, 1.0, 65535.0, 0.01, 5531.0, 0.01, 1.0, 0.01, 331.2330, 3250.01); INPUT_SET(InputSet::SelectCond, 0.0, 1.0); -INPUT_SET(InputSet::AllOnes, 1.0); +INPUT_SET(InputSet::AllScalarOnes, 1.0); END_INPUT_SETS() #undef BEGIN_INPUT_SETS diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 04f653459b..b560a754eb 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1305,6 +1305,30 @@ template T waveActiveProduct(T A, UINT WaveSize) { WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (waveActiveProduct(A, WaveSize))); +template T waveActiveBitAnd(T A, UINT WaveSize) { + UNREFERENCED_PARAMETER(WaveSize); + // We set the LSB to 0 in one of the lanes. + return static_cast(A & ~static_cast(1)); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveBitAnd, (waveActiveBitAnd(A, WaveSize))); + +template T waveActiveBitOr(T A, UINT WaveSize) { + UNREFERENCED_PARAMETER(WaveSize); + // We set the LSB to 0 in one of the lanes. + return static_cast(A | static_cast(1)); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveBitOr, (waveActiveBitOr(A, WaveSize))); + +template T waveActiveBitXor(T A, UINT WaveSize) { + UNREFERENCED_PARAMETER(WaveSize); + // We clear the LSB in every lane except the last lane which sets it to 1. + return static_cast(A | static_cast(1)); +} + +WAVE_ACTIVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); + #undef WAVE_ACTIVE_OP // @@ -2219,10 +2243,17 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); + // Note: WaveActiveBit* ops don't support uint16_t in HLSL + HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); + HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); + HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint64_t); + HLK_WAVEOP_TEST(WaveActiveBitAnd, uint64_t); + HLK_WAVEOP_TEST(WaveActiveBitOr, uint64_t); + HLK_WAVEOP_TEST(WaveActiveBitXor, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index b2031d3583..d3676ed2d5 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4111,6 +4111,41 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_ACTIVE_BIT_AND + vector TestWaveActiveBitAnd(vector Vector) + { + if(WaveGetLaneIndex() == (WaveGetLaneCount() - 1)) + { + // Clear the LSB on the last lane only. + Vector = Vector & ~((OUT_TYPE)1); + } + return WaveActiveBitAnd(Vector); + } + #endif + + #ifdef FUNC_WAVE_ACTIVE_BIT_OR + vector TestWaveActiveBitOr(vector Vector) + { + if(WaveGetLaneIndex() == (WaveGetLaneCount() - 1)) + { + // Set the LSB on the last lane only. + Vector = Vector | ((OUT_TYPE)1); + } + return WaveActiveBitOr(Vector); + } + #endif + + #ifdef FUNC_WAVE_ACTIVE_BIT_XOR + vector TestWaveActiveBitXor(vector Vector) + { + const uint isChosen = (WaveGetLaneIndex() == 0) ? 1 : 0; + // Clear the LSB for all lanes except lane 0, which sets it to 1. + Vector = (Vector & ~((OUT_TYPE)1)) | (OUT_TYPE)isChosen; + + return WaveActiveBitOr(Vector); + } + #endif + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From 8779d3fa7677c72aa7dc123593cb3e5f249c7f90 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Fri, 7 Nov 2025 16:18:50 -0800 Subject: [PATCH 11/28] Unreferenced --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index b560a754eb..4dd8a76062 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1305,24 +1305,21 @@ template T waveActiveProduct(T A, UINT WaveSize) { WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (waveActiveProduct(A, WaveSize))); -template T waveActiveBitAnd(T A, UINT WaveSize) { - UNREFERENCED_PARAMETER(WaveSize); +template T waveActiveBitAnd(T A, UINT) { // We set the LSB to 0 in one of the lanes. return static_cast(A & ~static_cast(1)); } WAVE_ACTIVE_OP(OpType::WaveActiveBitAnd, (waveActiveBitAnd(A, WaveSize))); -template T waveActiveBitOr(T A, UINT WaveSize) { - UNREFERENCED_PARAMETER(WaveSize); +template T waveActiveBitOr(T A, UINT) { // We set the LSB to 0 in one of the lanes. return static_cast(A | static_cast(1)); } WAVE_ACTIVE_OP(OpType::WaveActiveBitOr, (waveActiveBitOr(A, WaveSize))); -template T waveActiveBitXor(T A, UINT WaveSize) { - UNREFERENCED_PARAMETER(WaveSize); +template T waveActiveBitXor(T A, UINT) { // We clear the LSB in every lane except the last lane which sets it to 1. return static_cast(A | static_cast(1)); } From c32ba7d20a6ef904f5e0d53694cdb26c4561b871 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Fri, 7 Nov 2025 17:56:34 -0800 Subject: [PATCH 12/28] Add AllEqual --- .../unittests/HLSLExec/LongVectorOps.def | 1 + .../clang/unittests/HLSLExec/LongVectors.cpp | 30 +++++++++++++++++++ .../unittests/HLSLExec/ShaderOpArith.xml | 12 ++++++++ 3 files changed, 43 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index f49c04663e..9857284cb1 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -201,5 +201,6 @@ OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE OP_DEFAULT_DEFINES(Wave, WaveActiveBitAnd, 1, "TestWaveActiveBitAnd", "", " -DFUNC_WAVE_ACTIVE_BIT_AND=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitOr, 1, "TestWaveActiveBitOr", "", " -DFUNC_WAVE_ACTIVE_BIT_OR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFUNC_WAVE_ACTIVE_BIT_XOR=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveAllEqual, 1, "TestWaveActiveAllEqual", "", " -DFUNC_WAVE_ACTIVE_ALL_EQUAL=1") #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 4dd8a76062..df9976e547 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1326,6 +1326,25 @@ template T waveActiveBitXor(T A, UINT) { WAVE_ACTIVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); +template +struct Op : StrictValidation {}; + +template struct ExpectedBuilder { + static std::vector + buildExpected(Op &, + const InputSets &Inputs) { + DXASSERT_NOMSG(Inputs.size() == 1); + + std::vector Expected; + const size_t VectorSize = Inputs[0].size(); + Expected.assign(VectorSize - 1, static_cast(true)); + // We set the last element to a different value on a single lane. + Expected[VectorSize - 1] = static_cast(false); + + return Expected; + } +}; + #undef WAVE_ACTIVE_OP // @@ -2219,23 +2238,29 @@ class DxilConf_SM69_Vectorized { HLK_TEST(LoadAndStore_RD_SB_SRV, double); HLK_TEST(LoadAndStore_RD_SB_UAV, double); + HLK_TEST(WaveActiveAllEqual, HLSLBool_t); + HLK_WAVEOP_TEST(WaveActiveSum, int16_t); HLK_WAVEOP_TEST(WaveActiveMin, int16_t); HLK_WAVEOP_TEST(WaveActiveMax, int16_t); HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); + HLK_TEST(WaveActiveAllEqual, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); + HLK_TEST(WaveActiveAllEqual, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); + HLK_TEST(WaveActiveAllEqual, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); + HLK_TEST(WaveActiveAllEqual, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); @@ -2244,6 +2269,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); + HLK_TEST(WaveActiveAllEqual, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2251,19 +2277,23 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint64_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint64_t); + HLK_TEST(WaveActiveAllEqual, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMax, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveProduct, HLSLHalf_t); + HLK_TEST(WaveActiveAllEqual, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); HLK_WAVEOP_TEST(WaveActiveMin, float); HLK_WAVEOP_TEST(WaveActiveMax, float); HLK_WAVEOP_TEST(WaveActiveProduct, float); + HLK_TEST(WaveActiveAllEqual, float); HLK_WAVEOP_TEST(WaveActiveSum, double); HLK_WAVEOP_TEST(WaveActiveMin, double); HLK_WAVEOP_TEST(WaveActiveMax, double); HLK_WAVEOP_TEST(WaveActiveProduct, double); + HLK_TEST(WaveActiveAllEqual, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index d3676ed2d5..da676461ef 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4146,6 +4146,18 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_ACTIVE_ALL_EQUAL + vector TestWaveActiveAllEqual(vector Vector) + { + if(WaveGetLaneIndex() == (WaveGetLaneCount() - 1)) + { + Vector[NUM - 1] = (TYPE)1337; + } + + return WaveActiveAllEqual(Vector); + } + #endif + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From 70d00d8b50fa9b0e91509b1903949df127266c51 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Tue, 11 Nov 2025 20:55:52 -0800 Subject: [PATCH 13/28] All on Wave active macro --- .../unittests/HLSLExec/LongVectorOps.def | 2 + .../clang/unittests/HLSLExec/LongVectors.cpp | 89 ++++++++++++++++--- .../unittests/HLSLExec/ShaderOpArith.xml | 31 +++++++ 3 files changed, 110 insertions(+), 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 9857284cb1..6a19f953e8 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -202,5 +202,7 @@ OP_DEFAULT_DEFINES(Wave, WaveActiveBitAnd, 1, "TestWaveActiveBitAnd", "", " -DFU OP_DEFAULT_DEFINES(Wave, WaveActiveBitOr, 1, "TestWaveActiveBitOr", "", " -DFUNC_WAVE_ACTIVE_BIT_OR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFUNC_WAVE_ACTIVE_BIT_XOR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveAllEqual, 1, "TestWaveActiveAllEqual", "", " -DFUNC_WAVE_ACTIVE_ALL_EQUAL=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveReadLaneAt, 1, "TestWaveActiveReadLaneAt", "", " -DFUNC_WAVE_ACTIVE_READ_LANE_AT=1") +OP_DEFAULT_DEFINES(Wave, WaveActiveReadLaneFirst, 1, "TestWaveActiveReadLaneFirst", "", " -DFUNC_WAVE_ACTIVE_READ_LANE_FIRST=1") #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index df9976e547..52d0e7c09d 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -611,6 +611,7 @@ template struct Op; // ExpectedBuilder - specializations are expected to have buildExpectedData // member functions. template struct ExpectedBuilder; +template struct WaveOpExpectedBuilder; // Default Validation configuration - ULP for floating point types, exact // matches for everything else. @@ -1329,11 +1330,13 @@ WAVE_ACTIVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); template struct Op : StrictValidation {}; -template struct ExpectedBuilder { +template +struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, - const InputSets &Inputs) { + const InputSets &Inputs, UINT WaveSize) { DXASSERT_NOMSG(Inputs.size() == 1); + UNREFERENCED_PARAMETER(WaveSize); std::vector Expected; const size_t VectorSize = Inputs[0].size(); @@ -1345,6 +1348,48 @@ template struct ExpectedBuilder { } }; +template +struct Op : StrictValidation {}; + +template +struct WaveOpExpectedBuilder { + static std::vector buildExpected(Op &, + const InputSets &Inputs, + UINT WaveSize) { + DXASSERT_NOMSG(Inputs.size() == 1); + UNREFERENCED_PARAMETER(WaveSize); + + std::vector Expected; + const size_t VectorSize = Inputs[0].size(); + // Simple test, on the lane that we read we also fill the vector with the + // value of the first element. + Expected.assign(VectorSize, Inputs[0][0]); + + return Expected; + } +}; + +template +struct Op : StrictValidation {}; + +template +struct WaveOpExpectedBuilder { + static std::vector + buildExpected(Op &, + const InputSets &Inputs, UINT WaveSize) { + DXASSERT_NOMSG(Inputs.size() == 1); + UNREFERENCED_PARAMETER(WaveSize); + + std::vector Expected; + const size_t VectorSize = Inputs[0].size(); + // Simple test, on the lane that we read we also fill the vector with the + // value of the first element. + Expected.assign(VectorSize, Inputs[0][0]); + + return Expected; + } +}; + #undef WAVE_ACTIVE_OP // @@ -2238,29 +2283,39 @@ class DxilConf_SM69_Vectorized { HLK_TEST(LoadAndStore_RD_SB_SRV, double); HLK_TEST(LoadAndStore_RD_SB_UAV, double); - HLK_TEST(WaveActiveAllEqual, HLSLBool_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, HLSLBool_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, HLSLBool_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, HLSLBool_t); HLK_WAVEOP_TEST(WaveActiveSum, int16_t); HLK_WAVEOP_TEST(WaveActiveMin, int16_t); HLK_WAVEOP_TEST(WaveActiveMax, int16_t); HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); - HLK_TEST(WaveActiveAllEqual, int16_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, int16_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int16_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); - HLK_TEST(WaveActiveAllEqual, int32_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, int32_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int32_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); - HLK_TEST(WaveActiveAllEqual, int64_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, int64_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int64_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); - HLK_TEST(WaveActiveAllEqual, uint16_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, uint16_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint16_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); @@ -2269,7 +2324,9 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); - HLK_TEST(WaveActiveAllEqual, uint32_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, uint32_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint32_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2277,23 +2334,31 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint64_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint64_t); - HLK_TEST(WaveActiveAllEqual, uint64_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, uint64_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint64_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMax, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveProduct, HLSLHalf_t); - HLK_TEST(WaveActiveAllEqual, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveAllEqual, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); HLK_WAVEOP_TEST(WaveActiveMin, float); HLK_WAVEOP_TEST(WaveActiveMax, float); HLK_WAVEOP_TEST(WaveActiveProduct, float); - HLK_TEST(WaveActiveAllEqual, float); + HLK_WAVEOP_TEST(WaveActiveAllEqual, float); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, float); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, float); HLK_WAVEOP_TEST(WaveActiveSum, double); HLK_WAVEOP_TEST(WaveActiveMin, double); HLK_WAVEOP_TEST(WaveActiveMax, double); HLK_WAVEOP_TEST(WaveActiveProduct, double); - HLK_TEST(WaveActiveAllEqual, double); + HLK_WAVEOP_TEST(WaveActiveAllEqual, double); + HLK_WAVEOP_TEST(WaveActiveReadLaneAt, double); + HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index da676461ef..31a568e7c0 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4158,6 +4158,37 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_READ_LANE_AT + vector TestWaveReadLaneAt(vector Vector) + { + // Keep it simple and just read the last lane. + const uint LaneToRead = WaveGetLaneCount() - 1; + if(WaveGetLaneIndex() == LaneToRead) + { + [unroll] + for(uint i = 1; i < NUM; ++i) + { + Vector[i] = Vector[0]; + } + } + return WaveReadLaneAt(Vector, LaneToRead); + } + #endif + + #ifdef FUNC_WAVE_ACTIVE_READ_LANE_FIRST + vector TestWaveActiveReadLaneFirst(vector Vector) + { + if(WaveGetLaneIndex() == 0) + { + [unroll] + for(uint i = 1; i < NUM; ++i) + { + Vector[i] = Vector[0]; + } + } + return WaveActiveReadLaneFirst(Vector); + } + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From 3ee0d086a4402cbec4ec6fca5c423fa336a07f49 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:05:50 -0800 Subject: [PATCH 14/28] Fix namig for read --- .../unittests/HLSLExec/LongVectorOps.def | 4 +- .../clang/unittests/HLSLExec/LongVectors.cpp | 61 +++++++++---------- .../unittests/HLSLExec/ShaderOpArith.xml | 7 ++- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 6a19f953e8..4fa29a4c9b 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -202,7 +202,7 @@ OP_DEFAULT_DEFINES(Wave, WaveActiveBitAnd, 1, "TestWaveActiveBitAnd", "", " -DFU OP_DEFAULT_DEFINES(Wave, WaveActiveBitOr, 1, "TestWaveActiveBitOr", "", " -DFUNC_WAVE_ACTIVE_BIT_OR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFUNC_WAVE_ACTIVE_BIT_XOR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveAllEqual, 1, "TestWaveActiveAllEqual", "", " -DFUNC_WAVE_ACTIVE_ALL_EQUAL=1") -OP_DEFAULT_DEFINES(Wave, WaveActiveReadLaneAt, 1, "TestWaveActiveReadLaneAt", "", " -DFUNC_WAVE_ACTIVE_READ_LANE_AT=1") -OP_DEFAULT_DEFINES(Wave, WaveActiveReadLaneFirst, 1, "TestWaveActiveReadLaneFirst", "", " -DFUNC_WAVE_ACTIVE_READ_LANE_FIRST=1") +OP_DEFAULT_DEFINES(Wave, WaveReadLaneAt, 1, "TestWaveReadLaneAt", "", " -DFUNC_WAVE_READ_LANE_AT=1") +OP_DEFAULT_DEFINES(Wave, WaveReadLaneFirst, 1, "TestWaveReadLaneFirst", "", " -DFUNC_WAVE_READ_LANE_FIRST=1") #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 52d0e7c09d..39e830663c 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1327,6 +1327,8 @@ template T waveActiveBitXor(T A, UINT) { WAVE_ACTIVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); +#undef WAVE_ACTIVE_OP + template struct Op : StrictValidation {}; @@ -1349,11 +1351,10 @@ struct WaveOpExpectedBuilder { }; template -struct Op : StrictValidation {}; +struct Op : StrictValidation {}; -template -struct WaveOpExpectedBuilder { - static std::vector buildExpected(Op &, +template struct WaveOpExpectedBuilder { + static std::vector buildExpected(Op &, const InputSets &Inputs, UINT WaveSize) { DXASSERT_NOMSG(Inputs.size() == 1); @@ -1370,13 +1371,13 @@ struct WaveOpExpectedBuilder { }; template -struct Op : StrictValidation {}; +struct Op : StrictValidation {}; template -struct WaveOpExpectedBuilder { - static std::vector - buildExpected(Op &, - const InputSets &Inputs, UINT WaveSize) { +struct WaveOpExpectedBuilder { + static std::vector buildExpected(Op &, + const InputSets &Inputs, + UINT WaveSize) { DXASSERT_NOMSG(Inputs.size() == 1); UNREFERENCED_PARAMETER(WaveSize); @@ -1390,8 +1391,6 @@ struct WaveOpExpectedBuilder { } }; -#undef WAVE_ACTIVE_OP - // // dispatchTest // @@ -2284,38 +2283,38 @@ class DxilConf_SM69_Vectorized { HLK_TEST(LoadAndStore_RD_SB_UAV, double); HLK_WAVEOP_TEST(WaveActiveAllEqual, HLSLBool_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, HLSLBool_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, HLSLBool_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, HLSLBool_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, HLSLBool_t); HLK_WAVEOP_TEST(WaveActiveSum, int16_t); HLK_WAVEOP_TEST(WaveActiveMin, int16_t); HLK_WAVEOP_TEST(WaveActiveMax, int16_t); HLK_WAVEOP_TEST(WaveActiveProduct, int16_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, int16_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int16_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int16_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, int16_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); HLK_WAVEOP_TEST(WaveActiveProduct, int32_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, int32_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int32_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int32_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, int32_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); HLK_WAVEOP_TEST(WaveActiveProduct, int64_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, int64_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, int64_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, int64_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, int64_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint16_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, uint16_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint16_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint16_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, uint16_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); @@ -2325,8 +2324,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, uint32_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint32_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint32_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, uint32_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2335,30 +2334,30 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveBitOr, uint64_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint64_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, uint64_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, uint64_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, uint64_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, uint64_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMax, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveProduct, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveAllEqual, HLSLHalf_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, HLSLHalf_t); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveReadLaneAt, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveReadLaneFirst, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); HLK_WAVEOP_TEST(WaveActiveMin, float); HLK_WAVEOP_TEST(WaveActiveMax, float); HLK_WAVEOP_TEST(WaveActiveProduct, float); HLK_WAVEOP_TEST(WaveActiveAllEqual, float); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, float); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, float); + HLK_WAVEOP_TEST(WaveReadLaneAt, float); + HLK_WAVEOP_TEST(WaveReadLaneFirst, float); HLK_WAVEOP_TEST(WaveActiveSum, double); HLK_WAVEOP_TEST(WaveActiveMin, double); HLK_WAVEOP_TEST(WaveActiveMax, double); HLK_WAVEOP_TEST(WaveActiveProduct, double); HLK_WAVEOP_TEST(WaveActiveAllEqual, double); - HLK_WAVEOP_TEST(WaveActiveReadLaneAt, double); - HLK_WAVEOP_TEST(WaveActiveReadLaneFirst, double); + HLK_WAVEOP_TEST(WaveReadLaneAt, double); + HLK_WAVEOP_TEST(WaveReadLaneFirst, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 31a568e7c0..36bf99c0c1 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4175,8 +4175,8 @@ void MSMain(uint GID : SV_GroupIndex, } #endif - #ifdef FUNC_WAVE_ACTIVE_READ_LANE_FIRST - vector TestWaveActiveReadLaneFirst(vector Vector) + #ifdef FUNC_WAVE_READ_LANE_FIRST + vector TestWaveReadLaneFirst(vector Vector) { if(WaveGetLaneIndex() == 0) { @@ -4186,8 +4186,9 @@ void MSMain(uint GID : SV_GroupIndex, Vector[i] = Vector[0]; } } - return WaveActiveReadLaneFirst(Vector); + return WaveReadLaneFirst(Vector); } + #endif #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, From 916f87887be98357932e4ba474c68bcaa4579641 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 17:28:31 -0800 Subject: [PATCH 15/28] Cleanup. Remove WaveCountBits --- .../unittests/HLSLExec/LongVectorOps.def | 2 + .../clang/unittests/HLSLExec/LongVectors.cpp | 49 +++++++++++++++---- .../unittests/HLSLExec/ShaderOpArith.xml | 40 ++++++++++++--- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 4fa29a4c9b..c78039c86f 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -204,5 +204,7 @@ OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFU OP_DEFAULT_DEFINES(Wave, WaveActiveAllEqual, 1, "TestWaveActiveAllEqual", "", " -DFUNC_WAVE_ACTIVE_ALL_EQUAL=1") OP_DEFAULT_DEFINES(Wave, WaveReadLaneAt, 1, "TestWaveReadLaneAt", "", " -DFUNC_WAVE_READ_LANE_AT=1") OP_DEFAULT_DEFINES(Wave, WaveReadLaneFirst, 1, "TestWaveReadLaneFirst", "", " -DFUNC_WAVE_READ_LANE_FIRST=1") +OP_DEFAULT_DEFINES(Wave, WavePrefixSum, 1, "TestWavePrefixSum", "", " -DFUNC_WAVE_PREFIX_SUM=1 -DIS_WAVE_PREFIX_OP=1") +OP_DEFAULT_DEFINES(Wave, WavePrefixProduct, 1, "TestWavePrefixProduct", "", " -DFUNC_WAVE_PREFIX_PRODUCT=1 -DIS_WAVE_PREFIX_OP=1") #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 39e830663c..353485d9d6 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1265,7 +1265,7 @@ FLOAT_SPECIAL_OP(OpType::IsNan, (std::isnan(A))); // Wave Ops // -#define WAVE_ACTIVE_OP(OP, IMPL) \ +#define WAVE_OP(OP, IMPL) \ template struct Op : DefaultValidation { \ T operator()(T A, UINT WaveSize) { return IMPL; } \ }; @@ -1275,7 +1275,7 @@ template T waveActiveSum(T A, UINT WaveSize) { return A * WaveSizeT; } -WAVE_ACTIVE_OP(OpType::WaveActiveSum, (waveActiveSum(A, WaveSize))); +WAVE_OP(OpType::WaveActiveSum, (waveActiveSum(A, WaveSize))); template T waveActiveMin(T A, UINT WaveSize) { std::vector Values; @@ -1285,7 +1285,7 @@ template T waveActiveMin(T A, UINT WaveSize) { return *std::min_element(Values.begin(), Values.end()); } -WAVE_ACTIVE_OP(OpType::WaveActiveMin, (waveActiveMin(A, WaveSize))); +WAVE_OP(OpType::WaveActiveMin, (waveActiveMin(A, WaveSize))); template T waveActiveMax(T A, UINT WaveSize) { std::vector Values; @@ -1295,7 +1295,7 @@ template T waveActiveMax(T A, UINT WaveSize) { return *std::max_element(Values.begin(), Values.end()); } -WAVE_ACTIVE_OP(OpType::WaveActiveMax, (waveActiveMax(A, WaveSize))); +WAVE_OP(OpType::WaveActiveMax, (waveActiveMax(A, WaveSize))); template T waveActiveProduct(T A, UINT WaveSize) { // We want to avoid overflow of a large product. So, the WaveActiveProdFn has @@ -1304,30 +1304,29 @@ template T waveActiveProduct(T A, UINT WaveSize) { return A * static_cast(WaveSize - 1); } -WAVE_ACTIVE_OP(OpType::WaveActiveProduct, (waveActiveProduct(A, WaveSize))); +WAVE_OP(OpType::WaveActiveProduct, (waveActiveProduct(A, WaveSize))); template T waveActiveBitAnd(T A, UINT) { // We set the LSB to 0 in one of the lanes. return static_cast(A & ~static_cast(1)); } -WAVE_ACTIVE_OP(OpType::WaveActiveBitAnd, (waveActiveBitAnd(A, WaveSize))); +WAVE_OP(OpType::WaveActiveBitAnd, (waveActiveBitAnd(A, WaveSize))); template T waveActiveBitOr(T A, UINT) { // We set the LSB to 0 in one of the lanes. return static_cast(A | static_cast(1)); } -WAVE_ACTIVE_OP(OpType::WaveActiveBitOr, (waveActiveBitOr(A, WaveSize))); +WAVE_OP(OpType::WaveActiveBitOr, (waveActiveBitOr(A, WaveSize))); template T waveActiveBitXor(T A, UINT) { // We clear the LSB in every lane except the last lane which sets it to 1. return static_cast(A | static_cast(1)); } -WAVE_ACTIVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); +WAVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); -#undef WAVE_ACTIVE_OP template struct Op : StrictValidation {}; @@ -1391,6 +1390,20 @@ struct WaveOpExpectedBuilder { } }; +WAVE_OP(OpType::WavePrefixSum, (wavePrefixSum(A, WaveSize))); + +template T wavePrefixSum(T A, UINT WaveSize) { + return static_cast(A * static_cast(WaveSize/2)); +} + +WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); + +template T wavePrefixProduct(T A, UINT) { + return static_cast(A * 2); +} + +#undef WAVE_OP + // // dispatchTest // @@ -2293,6 +2306,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, int16_t); HLK_WAVEOP_TEST(WaveReadLaneAt, int16_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, int16_t); + HLK_WAVEOP_TEST(WavePrefixSum, int16_t); + HLK_WAVEOP_TEST(WavePrefixProduct, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); @@ -2300,6 +2315,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, int32_t); HLK_WAVEOP_TEST(WaveReadLaneAt, int32_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, int32_t); + HLK_WAVEOP_TEST(WavePrefixSum, int32_t); + HLK_WAVEOP_TEST(WavePrefixProduct, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); @@ -2307,6 +2324,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, int64_t); HLK_WAVEOP_TEST(WaveReadLaneAt, int64_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, int64_t); + HLK_WAVEOP_TEST(WavePrefixSum, int64_t); + HLK_WAVEOP_TEST(WavePrefixProduct, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); @@ -2315,6 +2334,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, uint16_t); HLK_WAVEOP_TEST(WaveReadLaneAt, uint16_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, uint16_t); + HLK_WAVEOP_TEST(WavePrefixSum, uint16_t); + HLK_WAVEOP_TEST(WavePrefixProduct, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); @@ -2326,6 +2347,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, uint32_t); HLK_WAVEOP_TEST(WaveReadLaneAt, uint32_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, uint32_t); + HLK_WAVEOP_TEST(WavePrefixSum, uint32_t); + HLK_WAVEOP_TEST(WavePrefixProduct, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2336,6 +2359,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, uint64_t); HLK_WAVEOP_TEST(WaveReadLaneAt, uint64_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, uint64_t); + HLK_WAVEOP_TEST(WavePrefixSum, uint64_t); + HLK_WAVEOP_TEST(WavePrefixProduct, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); @@ -2344,6 +2369,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, HLSLHalf_t); HLK_WAVEOP_TEST(WaveReadLaneAt, HLSLHalf_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, HLSLHalf_t); + HLK_WAVEOP_TEST(WavePrefixSum, HLSLHalf_t); + HLK_WAVEOP_TEST(WavePrefixProduct, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); HLK_WAVEOP_TEST(WaveActiveMin, float); HLK_WAVEOP_TEST(WaveActiveMax, float); @@ -2351,6 +2378,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, float); HLK_WAVEOP_TEST(WaveReadLaneAt, float); HLK_WAVEOP_TEST(WaveReadLaneFirst, float); + HLK_WAVEOP_TEST(WavePrefixSum, float); + HLK_WAVEOP_TEST(WavePrefixProduct, float); HLK_WAVEOP_TEST(WaveActiveSum, double); HLK_WAVEOP_TEST(WaveActiveMin, double); HLK_WAVEOP_TEST(WaveActiveMax, double); @@ -2358,6 +2387,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveAllEqual, double); HLK_WAVEOP_TEST(WaveReadLaneAt, double); HLK_WAVEOP_TEST(WaveReadLaneFirst, double); + HLK_WAVEOP_TEST(WavePrefixSum, double); + HLK_WAVEOP_TEST(WavePrefixProduct, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 36bf99c0c1..76e9fe610e 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4190,6 +4190,31 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_PREFIX_SUM + void TestWavePrefixSum(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint MidLane = LaneCount/2; + + if(WaveGetLaneIndex() == MidLane) + { + Vector = WavePrefixSum(Vector); + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + + #ifdef FUNC_WAVE_PREFIX_PRODUCT + void TestWavePrefixProduct(vector Vector) + { + if(WaveGetLaneIndex() == 1) + { + Vector = WavePrefixProduct(Vector); + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, @@ -4249,16 +4274,19 @@ void MSMain(uint GID : SV_GroupIndex, const uint32_t OutNum = NUM; #endif - #if IS_UNARY_OP - vector OutputVector = FUNC(Input1); + vector OutputVector; + #ifdef IS_WAVE_PREFIX_OP + // Wave prefix ops store the output on a specific lane only. + FUNC(Input1); + return; + #elif IS_UNARY_OP + OutputVector = FUNC(Input1); #elif IS_BINARY_OP - vector OutputVector = FUNC(Input1 OPERATOR - Input2); + OutputVector = FUNC(Input1 OPERATOR Input2); #elif IS_TERNARY_OP // Ternary ops don't bother expanding OPERATOR because its // always going to be comma for these test cases. - vector OutputVector = FUNC(Input1, Input2, - Input3); + OutputVector = FUNC(Input1, Input2, Input3); #endif g_OutputVector.Store< vector >(0, OutputVector); From 470e9af00e81168a7ca5163d16de44ecf5bc793e Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:23:04 -0800 Subject: [PATCH 16/28] Fix the prefix ops --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 11 ++++------- tools/clang/unittests/HLSLExec/ShaderOpArith.xml | 6 +++--- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 353485d9d6..ca83087fb6 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1335,9 +1335,8 @@ template struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, - const InputSets &Inputs, UINT WaveSize) { + const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); - UNREFERENCED_PARAMETER(WaveSize); std::vector Expected; const size_t VectorSize = Inputs[0].size(); @@ -1355,9 +1354,8 @@ struct Op : StrictValidation {}; template struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, const InputSets &Inputs, - UINT WaveSize) { + UINT) { DXASSERT_NOMSG(Inputs.size() == 1); - UNREFERENCED_PARAMETER(WaveSize); std::vector Expected; const size_t VectorSize = Inputs[0].size(); @@ -1376,9 +1374,8 @@ template struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, const InputSets &Inputs, - UINT WaveSize) { + UINT) { DXASSERT_NOMSG(Inputs.size() == 1); - UNREFERENCED_PARAMETER(WaveSize); std::vector Expected; const size_t VectorSize = Inputs[0].size(); @@ -1399,7 +1396,7 @@ template T wavePrefixSum(T A, UINT WaveSize) { WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); template T wavePrefixProduct(T A, UINT) { - return static_cast(A * 2); + return static_cast(A * A); } #undef WAVE_OP diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 76e9fe610e..64e7833be3 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4196,9 +4196,9 @@ void MSMain(uint GID : SV_GroupIndex, const uint LaneCount = WaveGetLaneCount(); const uint MidLane = LaneCount/2; + Vector = WavePrefixSum(Vector); if(WaveGetLaneIndex() == MidLane) { - Vector = WavePrefixSum(Vector); g_OutputVector.Store< vector >(0, Vector); } } @@ -4207,9 +4207,9 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_PREFIX_PRODUCT void TestWavePrefixProduct(vector Vector) { - if(WaveGetLaneIndex() == 1) + Vector = WavePrefixProduct(Vector); + if(WaveGetLaneIndex() == 2) { - Vector = WavePrefixProduct(Vector); g_OutputVector.Store< vector >(0, Vector); } } From 842e639125a81dbc96d2659c6d45af29485ad7ce Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:35:25 -0800 Subject: [PATCH 17/28] Comments --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index ca83087fb6..b28c3352d8 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1390,12 +1390,15 @@ struct WaveOpExpectedBuilder { WAVE_OP(OpType::WavePrefixSum, (wavePrefixSum(A, WaveSize))); template T wavePrefixSum(T A, UINT WaveSize) { + // We test the prefix sume in the 'middle' lane. This choice is arbitrary. return static_cast(A * static_cast(WaveSize/2)); } WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); template T wavePrefixProduct(T A, UINT) { + // We test the the prefix product in the 3rd lane to avoid overflow issues. + // So the result is A * A. return static_cast(A * A); } From eefa04e15a267277e1720992247fd5f1f95c6105 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:35:38 -0800 Subject: [PATCH 18/28] Clang format --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index b28c3352d8..c26e3627be 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1265,7 +1265,7 @@ FLOAT_SPECIAL_OP(OpType::IsNan, (std::isnan(A))); // Wave Ops // -#define WAVE_OP(OP, IMPL) \ +#define WAVE_OP(OP, IMPL) \ template struct Op : DefaultValidation { \ T operator()(T A, UINT WaveSize) { return IMPL; } \ }; @@ -1327,7 +1327,6 @@ template T waveActiveBitXor(T A, UINT) { WAVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); - template struct Op : StrictValidation {}; @@ -1353,8 +1352,7 @@ struct Op : StrictValidation {}; template struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, - const InputSets &Inputs, - UINT) { + const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); std::vector Expected; @@ -1373,8 +1371,7 @@ struct Op : StrictValidation {}; template struct WaveOpExpectedBuilder { static std::vector buildExpected(Op &, - const InputSets &Inputs, - UINT) { + const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); std::vector Expected; @@ -1391,7 +1388,7 @@ WAVE_OP(OpType::WavePrefixSum, (wavePrefixSum(A, WaveSize))); template T wavePrefixSum(T A, UINT WaveSize) { // We test the prefix sume in the 'middle' lane. This choice is arbitrary. - return static_cast(A * static_cast(WaveSize/2)); + return static_cast(A * static_cast(WaveSize / 2)); } WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); From 1939982d2a95c22d442ec62e14919cab343a9b69 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Wed, 12 Nov 2025 18:56:06 -0800 Subject: [PATCH 19/28] Actually fix merge conflict --- tools/clang/unittests/HLSLExec/LongVectorOps.def | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 835895e879..321985e894 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -19,11 +19,7 @@ INPUT_SET(Positive) INPUT_SET(Bitwise) INPUT_SET(SelectCond) INPUT_SET(FloatSpecial) -<<<<<<< HEAD INPUT_SET(AllScalarOnes) -======= -INPUT_SET(AllOnes) ->>>>>>> main #undef INPUT_SET From 4e6f6eba1307e38f24235ab3cee70ceb9deee234 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:13:09 -0800 Subject: [PATCH 20/28] Fix input set name --- tools/clang/unittests/HLSLExec/LongVectorOps.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 321985e894..f3908ff055 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -19,7 +19,7 @@ INPUT_SET(Positive) INPUT_SET(Bitwise) INPUT_SET(SelectCond) INPUT_SET(FloatSpecial) -INPUT_SET(AllScalarOnes) +INPUT_SET(AllOnes) #undef INPUT_SET @@ -198,7 +198,7 @@ OP_LOAD_AND_STORE_SB(LoadAndStore_RD_SB_SRV, "RootDescriptor_SRV") OP_DEFAULT(Wave, WaveActiveSum, 1, "WaveActiveSum", "") OP_DEFAULT_DEFINES(Wave, WaveActiveMin, 1, "TestWaveActiveMin", "", " -DFUNC_WAVE_ACTIVE_MIN=1") OP_DEFAULT_DEFINES(Wave, WaveActiveMax, 1, "TestWaveActiveMax", "", " -DFUNC_WAVE_ACTIVE_MAX=1") -OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE_PRODUCT=1", "LongVectorOp", AllScalarOnes, Default2, Default3) +OP(Wave, WaveActiveProduct, 1, "TestWaveActiveProduct", "", " -DFUNC_WAVE_ACTIVE_PRODUCT=1", "LongVectorOp", AllOnes, Default2, Default3) OP_DEFAULT_DEFINES(Wave, WaveActiveBitAnd, 1, "TestWaveActiveBitAnd", "", " -DFUNC_WAVE_ACTIVE_BIT_AND=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitOr, 1, "TestWaveActiveBitOr", "", " -DFUNC_WAVE_ACTIVE_BIT_OR=1") OP_DEFAULT_DEFINES(Wave, WaveActiveBitXor, 1, "TestWaveActiveBitXor", "", " -DFUNC_WAVE_ACTIVE_BIT_XOR=1") From 87bc68f51b54c9922e88a23a2897609ad0f4ab07 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 13 Nov 2025 16:59:47 -0800 Subject: [PATCH 21/28] MultiWavePrefixBitAnd --- .../unittests/HLSLExec/LongVectorOps.def | 2 ++ .../unittests/HLSLExec/LongVectorTestData.h | 24 +++++++++------ .../clang/unittests/HLSLExec/LongVectors.cpp | 30 ++++++++++++------- .../unittests/HLSLExec/ShaderOpArith.xml | 27 +++++++++++++++++ 4 files changed, 63 insertions(+), 20 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index f3908ff055..7b5e69be1f 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -20,6 +20,7 @@ INPUT_SET(Bitwise) INPUT_SET(SelectCond) INPUT_SET(FloatSpecial) INPUT_SET(AllOnes) +INPUT_SET(WaveMultiPrefixBitwise) #undef INPUT_SET @@ -207,5 +208,6 @@ OP_DEFAULT_DEFINES(Wave, WaveReadLaneAt, 1, "TestWaveReadLaneAt", "", " -DFUNC_W OP_DEFAULT_DEFINES(Wave, WaveReadLaneFirst, 1, "TestWaveReadLaneFirst", "", " -DFUNC_WAVE_READ_LANE_FIRST=1") OP_DEFAULT_DEFINES(Wave, WavePrefixSum, 1, "TestWavePrefixSum", "", " -DFUNC_WAVE_PREFIX_SUM=1 -DIS_WAVE_PREFIX_OP=1") OP_DEFAULT_DEFINES(Wave, WavePrefixProduct, 1, "TestWavePrefixProduct", "", " -DFUNC_WAVE_PREFIX_PRODUCT=1 -DIS_WAVE_PREFIX_OP=1") +OP(Wave, WaveMultiPrefixBitAnd, 1, "TestWaveMultiPrefixBitAnd", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_AND=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 35144173ba..d95b5553f9 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -289,7 +289,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555, static_cast(0xAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(int32_t) @@ -303,7 +304,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x55555555, static_cast(0xAAAAAAAA), std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFFFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(int64_t) @@ -317,7 +319,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, 6, 9, 0x5555555555555555LL, 0xAAAAAAAAAAAAAAAALL, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, 0xFFFFFFFFFFFFFFFFLL); END_INPUT_SETS() BEGIN_INPUT_SETS(uint16_t) @@ -328,7 +331,8 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 12, 13, 14, 15); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555, 0xAAAA, 0x8000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(uint32_t) @@ -339,7 +343,8 @@ INPUT_SET(InputSet::BitShiftRhs, 1, 6, 3, 0, 9, 3, 30, 31, 32); INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x55555555, 0xAAAAAAAA, 0x80000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, 0xFFFFFFFF); END_INPUT_SETS() BEGIN_INPUT_SETS(uint64_t) @@ -351,7 +356,8 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 0x8000000000000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); -INPUT_SET(InputSet::AllScalarOnes, 1); +INPUT_SET(InputSet::AllOnes, 1); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, 0xFFFFFFFFFFFFFFFF); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) @@ -382,7 +388,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0, 1.0 / 3.0); -INPUT_SET(InputSet::AllScalarOnes, 1.0); +INPUT_SET(InputSet::AllOnes, 1.0); END_INPUT_SETS() BEGIN_INPUT_SETS(float) @@ -410,7 +416,7 @@ INPUT_SET(InputSet::FloatSpecial, std::numeric_limits::infinity(), -std::numeric_limits::max(), std::numeric_limits::denorm_min(), std::numeric_limits::denorm_min() * 10.0f, 1.0f / 3.0f); -INPUT_SET(InputSet::AllScalarOnes, 1.0f); +INPUT_SET(InputSet::AllOnes, 1.0f); END_INPUT_SETS() BEGIN_INPUT_SETS(double) @@ -429,7 +435,7 @@ INPUT_SET(InputSet::SplitDouble, 0.0, -1.0, 1.0, -1.0, 12345678.87654321, -1.0, INPUT_SET(InputSet::Positive, 1.0, 1.0, 65535.0, 0.01, 5531.0, 0.01, 1.0, 0.01, 331.2330, 3250.01); INPUT_SET(InputSet::SelectCond, 0.0, 1.0); -INPUT_SET(InputSet::AllScalarOnes, 1.0); +INPUT_SET(InputSet::AllOnes, 1.0); END_INPUT_SETS() #undef BEGIN_INPUT_SETS diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index ffe968811d..1c7562e5f4 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -612,7 +612,6 @@ template struct Op; // ExpectedBuilder - specializations are expected to have buildExpectedData // member functions. template struct ExpectedBuilder; -template struct WaveOpExpectedBuilder; // Default Validation configuration - ULP for floating point types, exact // matches for everything else. @@ -1363,11 +1362,18 @@ template T waveActiveBitXor(T A, UINT) { WAVE_OP(OpType::WaveActiveBitXor, (waveActiveBitXor(A, WaveSize))); +WAVE_OP(OpType::WaveMultiPrefixBitAnd, waveMultiPrefixBitAnd(A, WaveSize)); + +template T waveMultiPrefixBitAnd(T A, UINT) { + // All lanes in the group mask use a mask to filter for only the second and + // third LSBs. + return static_cast(A & static_cast(0x6)); +} + template struct Op : StrictValidation {}; -template -struct WaveOpExpectedBuilder { +template struct ExpectedBuilder { static std::vector buildExpected(Op &, const InputSets &Inputs, UINT) { @@ -1386,7 +1392,7 @@ struct WaveOpExpectedBuilder { template struct Op : StrictValidation {}; -template struct WaveOpExpectedBuilder { +template struct ExpectedBuilder { static std::vector buildExpected(Op &, const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); @@ -1404,8 +1410,7 @@ template struct WaveOpExpectedBuilder { template struct Op : StrictValidation {}; -template -struct WaveOpExpectedBuilder { +template struct ExpectedBuilder { static std::vector buildExpected(Op &, const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); @@ -1478,9 +1483,6 @@ template struct ExpectedBuilder { return Expected; } -}; - -template struct WaveOpExpectedBuilder { static auto buildExpected(Op Op, const InputSets &Inputs, UINT WaveSize) { @@ -1560,8 +1562,7 @@ void dispatchWaveOpTest(ID3D12Device *D3DDevice, bool VerboseLogging, std::vector> Inputs = buildTestInputs(VectorSize, Operation.InputSets, Operation.Arity); - auto Expected = - WaveOpExpectedBuilder::buildExpected(Op, Inputs, WaveSize); + auto Expected = ExpectedBuilder::buildExpected(Op, Inputs, WaveSize); runAndVerify(D3DDevice, VerboseLogging, Operation, Inputs, Expected, Op.ValidationConfig, AdditionalCompilerOptions); @@ -2350,6 +2351,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, int16_t); HLK_WAVEOP_TEST(WavePrefixSum, int16_t); HLK_WAVEOP_TEST(WavePrefixProduct, int16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); @@ -2359,6 +2361,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, int32_t); HLK_WAVEOP_TEST(WavePrefixSum, int32_t); HLK_WAVEOP_TEST(WavePrefixProduct, int32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); @@ -2368,6 +2371,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, int64_t); HLK_WAVEOP_TEST(WavePrefixSum, int64_t); HLK_WAVEOP_TEST(WavePrefixProduct, int64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int64_t); HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); @@ -2378,11 +2382,13 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint16_t); HLK_WAVEOP_TEST(WavePrefixSum, uint16_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); // Note: WaveActiveBit* ops don't support uint16_t in HLSL + // But the WaveMultiPrefixBit ops support all int and uint types HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); @@ -2391,6 +2397,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint32_t); HLK_WAVEOP_TEST(WavePrefixSum, uint32_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2403,6 +2410,7 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint64_t); HLK_WAVEOP_TEST(WavePrefixSum, uint64_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 5bc9f7118e..31e02366ae 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4215,6 +4215,33 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_AND + void TestWaveMultiPrefixBitAnd(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint32_t Mask = 0xE; // Lanes 1,2,3. + + if(WaveGetLaneIndex() == 0) + { + // Clear LSB on lane 0 only. This should have no effect since lane 0 + // is not in the mask. + Vector = Vector & ~((OUT_TYPE)0x1); + } + else // On all other lanes mask for the second and third LSB. + { + Vector = (Vector & ((OUT_TYPE)0x6)); + } + + Vector = WaveMultiPrefixBitAnd(Vector, Mask); + if(WaveGetLaneIndex() == 3) + { + // Lane 3 is the last lane in the mask. Store the result from it. + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + + #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, vector Vector2, From c6b5ecf1d650d46d9d30b90329fb89e6ae9ceaca Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:00:02 -0800 Subject: [PATCH 22/28] Move comment for uint16_t WaveActiveBit ops --- tools/clang/unittests/HLSLExec/LongVectors.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 1c7562e5f4..e419484b92 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -2373,6 +2373,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WavePrefixProduct, int64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int64_t); + // Note: WaveActiveBit* ops don't support uint16_t in HLSL + // But the WaveMultiPrefixBit ops support all int and uint types HLK_WAVEOP_TEST(WaveActiveSum, uint16_t); HLK_WAVEOP_TEST(WaveActiveMin, uint16_t); HLK_WAVEOP_TEST(WaveActiveMax, uint16_t); @@ -2387,8 +2389,6 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); HLK_WAVEOP_TEST(WaveActiveProduct, uint32_t); - // Note: WaveActiveBit* ops don't support uint16_t in HLSL - // But the WaveMultiPrefixBit ops support all int and uint types HLK_WAVEOP_TEST(WaveActiveBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitOr, uint32_t); HLK_WAVEOP_TEST(WaveActiveBitXor, uint32_t); From 5edfa7d1154ced14c0ab13938345fc581995c2ba Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 17 Nov 2025 17:13:18 -0800 Subject: [PATCH 23/28] Xor and some cleanp --- .../unittests/HLSLExec/LongVectorOps.def | 2 + .../clang/unittests/HLSLExec/LongVectors.cpp | 62 ++++++++++++++- .../unittests/HLSLExec/ShaderOpArith.xml | 77 ++++++++++++++++++- 3 files changed, 135 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 7b5e69be1f..2b91ab6175 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -209,5 +209,7 @@ OP_DEFAULT_DEFINES(Wave, WaveReadLaneFirst, 1, "TestWaveReadLaneFirst", "", " -D OP_DEFAULT_DEFINES(Wave, WavePrefixSum, 1, "TestWavePrefixSum", "", " -DFUNC_WAVE_PREFIX_SUM=1 -DIS_WAVE_PREFIX_OP=1") OP_DEFAULT_DEFINES(Wave, WavePrefixProduct, 1, "TestWavePrefixProduct", "", " -DFUNC_WAVE_PREFIX_PRODUCT=1 -DIS_WAVE_PREFIX_OP=1") OP(Wave, WaveMultiPrefixBitAnd, 1, "TestWaveMultiPrefixBitAnd", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_AND=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) +OP(Wave, WaveMultiPrefixBitOr, 1, "TestWaveMultiPrefixBitOr", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_OR=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) +OP(Wave, WaveMultiPrefixBitXor, 1, "TestWaveMultiPrefixBitXor", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_XOR=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) #undef OP diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index e419484b92..84167a32a3 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1349,7 +1349,7 @@ template T waveActiveBitAnd(T A, UINT) { WAVE_OP(OpType::WaveActiveBitAnd, (waveActiveBitAnd(A, WaveSize))); template T waveActiveBitOr(T A, UINT) { - // We set the LSB to 0 in one of the lanes. + // We set the LSB to 1 in one of the lanes. return static_cast(A | static_cast(1)); } @@ -1370,6 +1370,50 @@ template T waveMultiPrefixBitAnd(T A, UINT) { return static_cast(A & static_cast(0x6)); } +WAVE_OP(OpType::WaveMultiPrefixBitOr, waveMultiPrefixBitOr(A, WaveSize)); + +template T waveMultiPrefixBitOr(T A, UINT) { + // All lanes in the group mask clear the second LSB. + return static_cast(A & ~static_cast(0x2)); +} + +template +struct Op : StrictValidation {}; + +template struct ExpectedBuilder { + static std::vector + buildExpected(Op &, + const InputSets &Inputs, UINT) { + DXASSERT_NOMSG(Inputs.size() == 1); + + std::vector Expected; + const size_t VectorSize = Inputs[0].size(); + + // We get a little creative for MultiPrefixBitXor. + // The mask we use for the group in the shader is 0xE, that is the 2nd, 3rd, + // and 4th lanes. Prefix ops don't include the value of the current lane in + // their result. + // So, for this test we store the result of WaveMuitiPrefixBitXor from and + // on the 3rd lane. This means the values of two lanes contribute to the + // result. Because this is a Xor, an even number of set bits results in 0, + // and an odd number of set bits results in 1. For this test we simply clear + // the lower half of the input values on lane 2 only. This means that we + // expect the lower half of the out values to match the input. And the + // second half to be all 0s. + for(size_t I = 0; I < VectorSize/2 ; ++I) + Expected.push_back(Inputs[0][I]); + for(size_t I = VectorSize/2; I < VectorSize - 1; ++I) + Expected.push_back(0); + + // We also set the last element to 0 on lane 2 so the last element in the + // output vector matches the last element in the input vector. + Expected.push_back(Inputs[0][VectorSize -1]); + + return Expected; + } +}; + + template struct Op : StrictValidation {}; @@ -2352,6 +2396,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WavePrefixSum, int16_t); HLK_WAVEOP_TEST(WavePrefixProduct, int16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, int16_t); HLK_WAVEOP_TEST(WaveActiveSum, int32_t); HLK_WAVEOP_TEST(WaveActiveMin, int32_t); HLK_WAVEOP_TEST(WaveActiveMax, int32_t); @@ -2362,6 +2408,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WavePrefixSum, int32_t); HLK_WAVEOP_TEST(WavePrefixProduct, int32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, int32_t); HLK_WAVEOP_TEST(WaveActiveSum, int64_t); HLK_WAVEOP_TEST(WaveActiveMin, int64_t); HLK_WAVEOP_TEST(WaveActiveMax, int64_t); @@ -2372,6 +2420,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WavePrefixSum, int64_t); HLK_WAVEOP_TEST(WavePrefixProduct, int64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, int64_t); // Note: WaveActiveBit* ops don't support uint16_t in HLSL // But the WaveMultiPrefixBit ops support all int and uint types @@ -2384,7 +2434,9 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint16_t); HLK_WAVEOP_TEST(WavePrefixSum, uint16_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint16_t); - HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint16_t); HLK_WAVEOP_TEST(WaveActiveSum, uint32_t); HLK_WAVEOP_TEST(WaveActiveMin, uint32_t); HLK_WAVEOP_TEST(WaveActiveMax, uint32_t); @@ -2397,7 +2449,9 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint32_t); HLK_WAVEOP_TEST(WavePrefixSum, uint32_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint32_t); - HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint32_t); HLK_WAVEOP_TEST(WaveActiveSum, uint64_t); HLK_WAVEOP_TEST(WaveActiveMin, uint64_t); HLK_WAVEOP_TEST(WaveActiveMax, uint64_t); @@ -2411,6 +2465,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WavePrefixSum, uint64_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint64_t); HLK_WAVEOP_TEST(WaveActiveSum, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveMin, HLSLHalf_t); diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 31e02366ae..93f2a97ff6 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4220,11 +4220,14 @@ void MSMain(uint GID : SV_GroupIndex, { const uint LaneCount = WaveGetLaneCount(); const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint32_t LaneIndex = WaveGetLaneIndex(); - if(WaveGetLaneIndex() == 0) + if(LaneIndex == 0 || LaneIndex == 3) { - // Clear LSB on lane 0 only. This should have no effect since lane 0 - // is not in the mask. + // Clear LSB on lane 0 and lane 3. This ensures that lane 0 which + // isn't in the mask doesn't participate. And because this is a + // prefix op we also want to confirm that lane 3 does not + // participate. Vector = Vector & ~((OUT_TYPE)0x1); } else // On all other lanes mask for the second and third LSB. @@ -4241,6 +4244,74 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_OR + void TestWaveMultiPrefixBitOr(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint32_t Mask = 0xE; // Lanes 1,2,3. + const bool IsActiveLaneInMask = ((Mask >> WaveGetLaneIndex()) & 0x1) != 0; + + if(IsActiveLaneInMask) + { + // Lanes inside the group clear the second LSB. + Vector = Vector & ~((OUT_TYPE)0x2); + } + else + { + // Lanes outside the group set the second LSB. + Vector = Vector | ((OUT_TYPE)0x2); + } + + if(WaveGetLaneIndex() == 3) + { + // Set all bits on lane 3 to ensure it doesn't affect the result. + // as this is a prefix op. + Vector = Vector | ~((OUT_TYPE)0x0); + } + + Vector = WaveMultiPrefixBitOr(Vector, Mask); + if(WaveGetLaneIndex() == 3) + { + // Lane 3 is the last lane in the mask. Store the result from it. + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + + #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_XOR + void TestWaveMultiPrefixBitXor(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint LaneIndex = WaveGetLaneIndex(); + + if(LaneIndex == 0) + { + // Lane 0 is not in the mask. So these values are expected + // to have no effect. + Vector = 0; + } + + if(LaneIndex == 2) + { + // Zero the lower half of the vector on a single lane in the mask. + [unroll] + for(uint I = 0; I < NUM/2; ++I) + { + Vector[I] = 0; + } + + // Same behavior for the last element. + Vector[NUM - 1] = 0; + } + + Vector = WaveMultiPrefixBitXor(Vector, Mask); + if(LaneIndex == 3) + { + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif #ifdef FUNC_TEST_SELECT vector TestSelect(vector Vector1, From f5e38f9062131ab1397588c4be948adac08acd85 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 17 Nov 2025 17:13:42 -0800 Subject: [PATCH 24/28] Clangity clang format --- .../unittests/HLSLExec/LongVectorTestData.h | 18 ++++++++++++------ tools/clang/unittests/HLSLExec/LongVectors.cpp | 12 +++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index d95b5553f9..3fe3aa8610 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -290,7 +290,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFF)); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, + static_cast(0xFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(int32_t) @@ -305,7 +306,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFFFFFF)); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, + static_cast(0xFFFFFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(int64_t) @@ -320,7 +322,8 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, 0xFFFFFFFFFFFFFFFFLL); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, + 0xFFFFFFFFFFFFFFFFLL); END_INPUT_SETS() BEGIN_INPUT_SETS(uint16_t) @@ -332,7 +335,8 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555, 0xAAAA, 0x8000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, static_cast(0xFFFF)); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, + static_cast(0xFFFF)); END_INPUT_SETS() BEGIN_INPUT_SETS(uint32_t) @@ -344,7 +348,8 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x55555555, 0xAAAAAAAA, 0x80000000, 127, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, 0xFFFFFFFF); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, + 0xFFFFFFFF); END_INPUT_SETS() BEGIN_INPUT_SETS(uint64_t) @@ -357,7 +362,8 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555555555555555, std::numeric_limits::max()); INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); -INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, 0xFFFFFFFFFFFFFFFF); +INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, + 0xFFFFFFFFFFFFFFFF); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index 84167a32a3..cb8b474739 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1381,9 +1381,8 @@ template struct Op : StrictValidation {}; template struct ExpectedBuilder { - static std::vector - buildExpected(Op &, - const InputSets &Inputs, UINT) { + static std::vector buildExpected(Op &, + const InputSets &Inputs, UINT) { DXASSERT_NOMSG(Inputs.size() == 1); std::vector Expected; @@ -1400,20 +1399,19 @@ template struct ExpectedBuilder { // the lower half of the input values on lane 2 only. This means that we // expect the lower half of the out values to match the input. And the // second half to be all 0s. - for(size_t I = 0; I < VectorSize/2 ; ++I) + for (size_t I = 0; I < VectorSize / 2; ++I) Expected.push_back(Inputs[0][I]); - for(size_t I = VectorSize/2; I < VectorSize - 1; ++I) + for (size_t I = VectorSize / 2; I < VectorSize - 1; ++I) Expected.push_back(0); // We also set the last element to 0 on lane 2 so the last element in the // output vector matches the last element in the input vector. - Expected.push_back(Inputs[0][VectorSize -1]); + Expected.push_back(Inputs[0][VectorSize - 1]); return Expected; } }; - template struct Op : StrictValidation {}; From ed88e0ee0958f70184297112aea31b1657f46bc9 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 17 Nov 2025 18:26:22 -0800 Subject: [PATCH 25/28] Finish multi ops. Needs a little tidy --- .../unittests/HLSLExec/LongVectorOps.def | 2 + .../clang/unittests/HLSLExec/LongVectors.cpp | 37 +++++++++++- .../unittests/HLSLExec/ShaderOpArith.xml | 60 +++++++++++++++++++ 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorOps.def b/tools/clang/unittests/HLSLExec/LongVectorOps.def index 2b91ab6175..9cf1784239 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorOps.def +++ b/tools/clang/unittests/HLSLExec/LongVectorOps.def @@ -208,6 +208,8 @@ OP_DEFAULT_DEFINES(Wave, WaveReadLaneAt, 1, "TestWaveReadLaneAt", "", " -DFUNC_W OP_DEFAULT_DEFINES(Wave, WaveReadLaneFirst, 1, "TestWaveReadLaneFirst", "", " -DFUNC_WAVE_READ_LANE_FIRST=1") OP_DEFAULT_DEFINES(Wave, WavePrefixSum, 1, "TestWavePrefixSum", "", " -DFUNC_WAVE_PREFIX_SUM=1 -DIS_WAVE_PREFIX_OP=1") OP_DEFAULT_DEFINES(Wave, WavePrefixProduct, 1, "TestWavePrefixProduct", "", " -DFUNC_WAVE_PREFIX_PRODUCT=1 -DIS_WAVE_PREFIX_OP=1") +OP(Wave, WaveMultiPrefixSum, 1, "TestWaveMultiPrefixSum", "", " -DFUNC_WAVE_MULTI_PREFIX_SUM=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", Default1, Default2, Default3) +OP(Wave, WaveMultiPrefixProduct, 1, "TestWaveMultiPrefixProduct", "", " -DFUNC_WAVE_MULTI_PREFIX_PRODUCT=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", Default1, Default2, Default3) OP(Wave, WaveMultiPrefixBitAnd, 1, "TestWaveMultiPrefixBitAnd", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_AND=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) OP(Wave, WaveMultiPrefixBitOr, 1, "TestWaveMultiPrefixBitOr", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_OR=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) OP(Wave, WaveMultiPrefixBitXor, 1, "TestWaveMultiPrefixBitXor", "", " -DFUNC_WAVE_MULTI_PREFIX_BIT_XOR=1 -DIS_WAVE_PREFIX_OP=1", "LongVectorOp", WaveMultiPrefixBitwise, Default2, Default3) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index cb8b474739..cc6dc464ae 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1470,8 +1470,14 @@ template struct ExpectedBuilder { WAVE_OP(OpType::WavePrefixSum, (wavePrefixSum(A, WaveSize))); template T wavePrefixSum(T A, UINT WaveSize) { - // We test the prefix sume in the 'middle' lane. This choice is arbitrary. - return static_cast(A * static_cast(WaveSize / 2)); + // We test the prefix sum in the 'middle' lane. This choice is arbitrary. + return A * static_cast(WaveSize / 2); +} + +WAVE_OP(OpType::WaveMultiPrefixSum, (waveMultiPrefixSum(A, WaveSize))); + +template T waveMultiPrefixSum(T A, UINT) { + return A * static_cast(2u); } WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); @@ -1479,7 +1485,14 @@ WAVE_OP(OpType::WavePrefixProduct, (wavePrefixProduct(A, WaveSize))); template T wavePrefixProduct(T A, UINT) { // We test the the prefix product in the 3rd lane to avoid overflow issues. // So the result is A * A. - return static_cast(A * A); + return A * A; +} + +WAVE_OP(OpType::WaveMultiPrefixProduct, (waveMultiPrefixProduct(A, WaveSize))); + +template T waveMultiPrefixProduct(T A, UINT) { + // The group mask has 3 lanes. + return A * A; } #undef WAVE_OP @@ -2393,6 +2406,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, int16_t); HLK_WAVEOP_TEST(WavePrefixSum, int16_t); HLK_WAVEOP_TEST(WavePrefixProduct, int16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, int16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, int16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, int16_t); @@ -2404,6 +2419,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneAt, int32_t); HLK_WAVEOP_TEST(WaveReadLaneFirst, int32_t); HLK_WAVEOP_TEST(WavePrefixSum, int32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, int32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, int32_t); HLK_WAVEOP_TEST(WavePrefixProduct, int32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int32_t); @@ -2417,6 +2434,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, int64_t); HLK_WAVEOP_TEST(WavePrefixSum, int64_t); HLK_WAVEOP_TEST(WavePrefixProduct, int64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, int64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, int64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, int64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, int64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, int64_t); @@ -2432,6 +2451,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint16_t); HLK_WAVEOP_TEST(WavePrefixSum, uint16_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, uint16_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, uint16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint16_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint16_t); @@ -2447,6 +2468,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint32_t); HLK_WAVEOP_TEST(WavePrefixSum, uint32_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, uint32_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, uint32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint32_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint32_t); @@ -2462,6 +2485,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, uint64_t); HLK_WAVEOP_TEST(WavePrefixSum, uint64_t); HLK_WAVEOP_TEST(WavePrefixProduct, uint64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, uint64_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, uint64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitAnd, uint64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitOr, uint64_t); HLK_WAVEOP_TEST(WaveMultiPrefixBitXor, uint64_t); @@ -2475,6 +2500,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, HLSLHalf_t); HLK_WAVEOP_TEST(WavePrefixSum, HLSLHalf_t); HLK_WAVEOP_TEST(WavePrefixProduct, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, HLSLHalf_t); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, HLSLHalf_t); HLK_WAVEOP_TEST(WaveActiveSum, float); HLK_WAVEOP_TEST(WaveActiveMin, float); HLK_WAVEOP_TEST(WaveActiveMax, float); @@ -2484,6 +2511,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, float); HLK_WAVEOP_TEST(WavePrefixSum, float); HLK_WAVEOP_TEST(WavePrefixProduct, float); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, float); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, float); HLK_WAVEOP_TEST(WaveActiveSum, double); HLK_WAVEOP_TEST(WaveActiveMin, double); HLK_WAVEOP_TEST(WaveActiveMax, double); @@ -2493,6 +2522,8 @@ class DxilConf_SM69_Vectorized { HLK_WAVEOP_TEST(WaveReadLaneFirst, double); HLK_WAVEOP_TEST(WavePrefixSum, double); HLK_WAVEOP_TEST(WavePrefixProduct, double); + HLK_WAVEOP_TEST(WaveMultiPrefixSum, double); + HLK_WAVEOP_TEST(WaveMultiPrefixProduct, double); private: bool Initialized = false; diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 93f2a97ff6..1900309bd7 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4215,6 +4215,66 @@ void MSMain(uint GID : SV_GroupIndex, } #endif + #ifdef FUNC_WAVE_MULTI_PREFIX_SUM + void TestWaveMultiPrefixSum(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint LaneIndex = WaveGetLaneIndex(); + + if(LaneIndex == 0) + { + // Lane 0 isn't in the mask. Shove in a value to make sure it + // doesn't constribute to the result. + Vector = 1; + } + + if(LaneIndex == 3) + { + // Lane 3 is the last lane in the mask. We want to make sure + // it doesn't contribute to the result as this is a prefix op. + Vector = 10; + } + + Vector = WaveMultiPrefixSum(Vector, Mask); + if(WaveGetLaneIndex() == 3) + { + // Lane 3 is the last lane in the mask. Store the result from it. + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + + #ifdef FUNC_WAVE_MULTI_PREFIX_PRODUCT + void TestWaveMultiPrefixProduct(vector Vector) + { + const uint LaneCount = WaveGetLaneCount(); + const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint LaneIndex = WaveGetLaneIndex(); + + if(LaneIndex == 0) + { + // Lane 0 isn't in the mask. Shove in a value to make sure it + // doesn't constribute to the result. + Vector = 4; + } + + if(LaneIndex == 3) + { + // Lane 3 is the last lane in the mask. We want to make sure + // it doesn't contribute to the result as this is a prefix op. + Vector = 10; + } + + Vector = WaveMultiPrefixProduct(Vector, Mask); + if(WaveGetLaneIndex() == 3) + { + // Lane 3 is the last lane in the mask. Store the result from it. + g_OutputVector.Store< vector >(0, Vector); + } + } + #endif + #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_AND void TestWaveMultiPrefixBitAnd(vector Vector) { From e80ce9195372f651c8b1ebd527fbfae17fc2b8eb Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Mon, 17 Nov 2025 18:55:39 -0800 Subject: [PATCH 26/28] Comment cleanup. Fix typo --- .../clang/unittests/HLSLExec/LongVectors.cpp | 26 +++++++------ .../unittests/HLSLExec/ShaderOpArith.xml | 38 +++++++++++-------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectors.cpp b/tools/clang/unittests/HLSLExec/LongVectors.cpp index cc6dc464ae..a52eb56581 100644 --- a/tools/clang/unittests/HLSLExec/LongVectors.cpp +++ b/tools/clang/unittests/HLSLExec/LongVectors.cpp @@ -1388,17 +1388,21 @@ template struct ExpectedBuilder { std::vector Expected; const size_t VectorSize = Inputs[0].size(); - // We get a little creative for MultiPrefixBitXor. - // The mask we use for the group in the shader is 0xE, that is the 2nd, 3rd, - // and 4th lanes. Prefix ops don't include the value of the current lane in - // their result. - // So, for this test we store the result of WaveMuitiPrefixBitXor from and - // on the 3rd lane. This means the values of two lanes contribute to the - // result. Because this is a Xor, an even number of set bits results in 0, - // and an odd number of set bits results in 1. For this test we simply clear - // the lower half of the input values on lane 2 only. This means that we - // expect the lower half of the out values to match the input. And the - // second half to be all 0s. + // We get a little creative for MultiPrefixBitXor. The mask we use for the + // group in the shader is 0xE (0b1110), which includes lanes 1, 2, and 3. + // Prefix ops don't include the value of the current lane in their result. + // So, for this test we store the result of WaveMultiPrefixBitXor from lane + // 3. This means only the values from lanes 1 and 2 contribute to the result + // at lane 3. + // + // In the shader: + // - Lane 0: Set to 0 (not in mask, shouldn't affect result) + // - Lane 1: Keeps original input values + // - Lane 2: Lower half + last element set to 0, upper half keeps input + // - Lane 3: Stores the prefix XOR result (lanes 1 XOR lanes 2) + // + // Expected result: Lower half matches input (lane 1 XOR 0), upper half is + // 0s, except last element matches input. for (size_t I = 0; I < VectorSize / 2; ++I) Expected.push_back(Inputs[0][I]); for (size_t I = VectorSize / 2; I < VectorSize - 1; ++I) diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 1900309bd7..82cfde9d89 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4279,19 +4279,19 @@ void MSMain(uint GID : SV_GroupIndex, void TestWaveMultiPrefixBitAnd(vector Vector) { const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) const uint32_t LaneIndex = WaveGetLaneIndex(); if(LaneIndex == 0 || LaneIndex == 3) { - // Clear LSB on lane 0 and lane 3. This ensures that lane 0 which - // isn't in the mask doesn't participate. And because this is a - // prefix op we also want to confirm that lane 3 does not - // participate. + // Clear LSB on lane 0 and lane 3. Lane 0 isn't in the mask so + // shouldn't participate. Lane 3 is the output lane for this prefix + // op, so we set distinctive bits to verify it doesn't affect its own result. Vector = Vector & ~((OUT_TYPE)0x1); } - else // On all other lanes mask for the second and third LSB. + else // Lanes 1,2 (active contributors to the prefix operation) { + // Keep only bits 1 and 2 (0x6 = 0b0110) to create predictable AND patterns Vector = (Vector & ((OUT_TYPE)0x6)); } @@ -4308,24 +4308,26 @@ void MSMain(uint GID : SV_GroupIndex, void TestWaveMultiPrefixBitOr(vector Vector) { const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) const bool IsActiveLaneInMask = ((Mask >> WaveGetLaneIndex()) & 0x1) != 0; if(IsActiveLaneInMask) { - // Lanes inside the group clear the second LSB. + // Lanes 1,2,3 (inside the mask): Clear bit 1 (0x2) to create + // predictable OR patterns Vector = Vector & ~((OUT_TYPE)0x2); } else { - // Lanes outside the group set the second LSB. + // Lane 0 (outside the mask): Set bit 1 to verify this lane + // doesn't contribute to the result Vector = Vector | ((OUT_TYPE)0x2); } if(WaveGetLaneIndex() == 3) { - // Set all bits on lane 3 to ensure it doesn't affect the result. - // as this is a prefix op. + // Lane 3 is the output lane: Set all bits to verify it doesn't + // affect its own prefix result (since prefix excludes current lane) Vector = Vector | ~((OUT_TYPE)0x0); } @@ -4342,32 +4344,36 @@ void MSMain(uint GID : SV_GroupIndex, void TestWaveMultiPrefixBitXor(vector Vector) { const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3. + const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) const uint LaneIndex = WaveGetLaneIndex(); if(LaneIndex == 0) { - // Lane 0 is not in the mask. So these values are expected - // to have no effect. + // Lane 0 is not in the mask, so these values should have no effect + // on the prefix result. Set to 0 to verify exclusion. Vector = 0; } if(LaneIndex == 2) { - // Zero the lower half of the vector on a single lane in the mask. + // Lane 2: Create a specific pattern for XOR testing. + // Zero the lower half of the vector to create predictable XOR results. [unroll] for(uint I = 0; I < NUM/2; ++I) { Vector[I] = 0; } - // Same behavior for the last element. + // Also zero the last element to test edge cases Vector[NUM - 1] = 0; } + // Lane 1 and 3: Keep original input values + // Lane 3 will store the result (lane 1 XOR lane 2 prefix) Vector = WaveMultiPrefixBitXor(Vector, Mask); if(LaneIndex == 3) { + // Store result from lane 3 (last lane in mask) g_OutputVector.Store< vector >(0, Vector); } } From cd9f2e771cea9cd7b656af3bb1afc1536bdbe0de Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:37:13 -0800 Subject: [PATCH 27/28] Update to use numeric limits and -1 in bitwise sets --- tools/clang/unittests/HLSLExec/LongVectorTestData.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LongVectorTestData.h b/tools/clang/unittests/HLSLExec/LongVectorTestData.h index 3fe3aa8610..ce32b4c035 100644 --- a/tools/clang/unittests/HLSLExec/LongVectorTestData.h +++ b/tools/clang/unittests/HLSLExec/LongVectorTestData.h @@ -291,7 +291,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, - static_cast(0xFFFF)); + -1); END_INPUT_SETS() BEGIN_INPUT_SETS(int32_t) @@ -307,7 +307,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, - static_cast(0xFFFFFFFF)); + -1); END_INPUT_SETS() BEGIN_INPUT_SETS(int64_t) @@ -323,7 +323,7 @@ INPUT_SET(InputSet::Bitwise, std::numeric_limits::min(), -1, 0, 1, 3, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, - 0xFFFFFFFFFFFFFFFFLL); + -1ll); END_INPUT_SETS() BEGIN_INPUT_SETS(uint16_t) @@ -336,7 +336,7 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555, 0xAAAA, 0x8000, 127, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0x10, 0x12, 0xF, - static_cast(0xFFFF)); + std::numeric_limits::max()); END_INPUT_SETS() BEGIN_INPUT_SETS(uint32_t) @@ -349,7 +349,7 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x55555555, 0xAAAAAAAA, 0x80000000, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, - 0xFFFFFFFF); + std::numeric_limits::max()); END_INPUT_SETS() BEGIN_INPUT_SETS(uint64_t) @@ -363,7 +363,7 @@ INPUT_SET(InputSet::Bitwise, 0, 1, 3, 6, 9, 0x5555555555555555, INPUT_SET(InputSet::SelectCond, 0, 1); INPUT_SET(InputSet::AllOnes, 1); INPUT_SET(InputSet::WaveMultiPrefixBitwise, 0x0, 0x1, 0x3, 0x4, 0xA, 0xC, 0xF, - 0xFFFFFFFFFFFFFFFF); + std::numeric_limits::max()); END_INPUT_SETS() BEGIN_INPUT_SETS(HLSLHalf_t) From d5f450ab1c48d534db7f057f49742421a1652724 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:59:57 -0800 Subject: [PATCH 28/28] Use keys and WaveMatch for the WaveMulti tests --- .../unittests/HLSLExec/ShaderOpArith.xml | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml index 82cfde9d89..fe238bdfed 100644 --- a/tools/clang/unittests/HLSLExec/ShaderOpArith.xml +++ b/tools/clang/unittests/HLSLExec/ShaderOpArith.xml @@ -4218,18 +4218,20 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_MULTI_PREFIX_SUM void TestWaveMultiPrefixSum(vector Vector) { - const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3. - const uint LaneIndex = WaveGetLaneIndex(); + uint Key = (WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) ? 1u : 0u; + + // Two groups. Lanes 1,2,3 in one group (Key=1), Lanes 0,(4..N) in + // other (Key=0). + uint4 Mask = WaveMatch(Key); - if(LaneIndex == 0) + if(WaveGetLaneIndex() == 0) { // Lane 0 isn't in the mask. Shove in a value to make sure it // doesn't constribute to the result. Vector = 1; } - if(LaneIndex == 3) + if(WaveGetLaneIndex() >= 3) { // Lane 3 is the last lane in the mask. We want to make sure // it doesn't contribute to the result as this is a prefix op. @@ -4239,7 +4241,8 @@ void MSMain(uint GID : SV_GroupIndex, Vector = WaveMultiPrefixSum(Vector, Mask); if(WaveGetLaneIndex() == 3) { - // Lane 3 is the last lane in the mask. Store the result from it. + // Lane 3 is the last lane in the mask that we care about. Store the + // result from it. g_OutputVector.Store< vector >(0, Vector); } } @@ -4248,18 +4251,20 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_MULTI_PREFIX_PRODUCT void TestWaveMultiPrefixProduct(vector Vector) { - const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3. - const uint LaneIndex = WaveGetLaneIndex(); + uint Key = (WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) ? 1u : 0u; - if(LaneIndex == 0) + // Two groups. Lanes 1,2,3 in one group (Key=1), Lanes 0,(4..N) in + // other (Key=0). + uint4 Mask = WaveMatch(Key); + + if(WaveGetLaneIndex() == 0) { // Lane 0 isn't in the mask. Shove in a value to make sure it // doesn't constribute to the result. Vector = 4; } - if(LaneIndex == 3) + if(WaveGetLaneIndex() == 3) { // Lane 3 is the last lane in the mask. We want to make sure // it doesn't contribute to the result as this is a prefix op. @@ -4278,11 +4283,13 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_AND void TestWaveMultiPrefixBitAnd(vector Vector) { - const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) - const uint32_t LaneIndex = WaveGetLaneIndex(); + uint Key = (WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) ? 1u : 0u; - if(LaneIndex == 0 || LaneIndex == 3) + // Two groups. Lanes 1,2,3 in one group (Key=1), Lanes 0,(4..N) in + // other (Key=0). + uint4 Mask = WaveMatch(Key); + + if(WaveGetLaneIndex() == 0 || WaveGetLaneIndex() == 3) { // Clear LSB on lane 0 and lane 3. Lane 0 isn't in the mask so // shouldn't participate. Lane 3 is the output lane for this prefix @@ -4307,11 +4314,13 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_OR void TestWaveMultiPrefixBitOr(vector Vector) { - const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) - const bool IsActiveLaneInMask = ((Mask >> WaveGetLaneIndex()) & 0x1) != 0; + uint Key = (WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) ? 1u : 0u; - if(IsActiveLaneInMask) + // Two groups. Lanes 1,2,3 in one group (Key=1), Lanes 0,(4..N) in + // other (Key=0). + uint4 Mask = WaveMatch(Key); + + if(WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) { // Lanes 1,2,3 (inside the mask): Clear bit 1 (0x2) to create // predictable OR patterns @@ -4343,18 +4352,20 @@ void MSMain(uint GID : SV_GroupIndex, #ifdef FUNC_WAVE_MULTI_PREFIX_BIT_XOR void TestWaveMultiPrefixBitXor(vector Vector) { - const uint LaneCount = WaveGetLaneCount(); - const uint32_t Mask = 0xE; // Lanes 1,2,3 (0xE = 0b1110) - const uint LaneIndex = WaveGetLaneIndex(); + uint Key = (WaveGetLaneIndex() == 1 || WaveGetLaneIndex() == 2 || WaveGetLaneIndex() == 3) ? 1u : 0u; - if(LaneIndex == 0) + // Two groups. Lanes 1,2,3 in one group (Key=1), Lanes 0,(4..N) in + // other (Key=0). + uint4 Mask = WaveMatch(Key); + + if(WaveGetLaneIndex() == 0) { // Lane 0 is not in the mask, so these values should have no effect // on the prefix result. Set to 0 to verify exclusion. Vector = 0; } - if(LaneIndex == 2) + if(WaveGetLaneIndex() == 2) { // Lane 2: Create a specific pattern for XOR testing. // Zero the lower half of the vector to create predictable XOR results. @@ -4371,7 +4382,7 @@ void MSMain(uint GID : SV_GroupIndex, // Lane 3 will store the result (lane 1 XOR lane 2 prefix) Vector = WaveMultiPrefixBitXor(Vector, Mask); - if(LaneIndex == 3) + if(WaveGetLaneIndex() == 3) { // Store result from lane 3 (last lane in mask) g_OutputVector.Store< vector >(0, Vector);