diff --git a/13_BitonicSort/CMakeLists.txt b/13_BitonicSort/CMakeLists.txt new file mode 100644 index 000000000..b7cad41da --- /dev/null +++ b/13_BitonicSort/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() diff --git a/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl new file mode 100644 index 000000000..3ea39262f --- /dev/null +++ b/13_BitonicSort/app_resources/bitonic_sort_shader.comp.hlsl @@ -0,0 +1,70 @@ +#include "common.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/bitonic_sort.hlsl" + +/* Push constants: main() reads deviceBufferAddress from here to build the buffer accessor. */ +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +using BitonicSortConfig = workgroup::bitonic_sort::bitonic_sort_config >; + +NBL_CONSTEXPR uint32_t 
WorkgroupSize = BitonicSortConfig::WorkgroupSize; + +/* Workgroup-shared scratch array, sized by BitonicSortConfig::SharedmemDWORDs. */ +groupshared uint32_t sharedmem[BitonicSortConfig::SharedmemDWORDs]; + +/* Reports the workgroup size as (BitonicSortConfig::WorkgroupSize, 1, 1). */ +uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(BitonicSortConfig::WorkgroupSize), 1, 1); } + +/* Accessor over `sharedmem`: set/get write/read the element at `idx`; the barrier method forwards to glsl::barrier(). */ +struct SharedMemoryAccessor +{ + template + void set(IndexType idx, AccessType value) + { + sharedmem[idx] = value; + } + + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = sharedmem[idx]; + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + } +}; + +/* Accessor over a raw 64-bit address: element `index` is loaded from / stored to `address + index * sizeof(AccessType)`. */ +struct Accessor +{ + /* Builds an Accessor whose `address` member is the given value. */ + static Accessor create(const uint64_t address) + { + Accessor accessor; + accessor.address = address; + return accessor; + } + + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) + { + value = vk::RawBufferLoad(address + index * sizeof(AccessType)); + } + + template + void set(const IndexType index, const AccessType value) + { + vk::RawBufferStore(address + index * sizeof(AccessType), value); + } + + uint64_t address; +}; + +/* Entry point: wraps the pushed address in an Accessor and passes it, together with the shared-memory accessor, to the workgroup bitonic sort. */ +[numthreads(BitonicSortConfig::WorkgroupSize, 1, 1)] +[shader("compute")] +void main() +{ + Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); + SharedMemoryAccessor sharedmemAccessor; + + // The sort handles load/store internally + workgroup::BitonicSort::template __call(accessor, sharedmemAccessor); +} diff --git a/13_BitonicSort/app_resources/common.hlsl b/13_BitonicSort/app_resources/common.hlsl new file mode 100644 index 000000000..9f8f4dd3b --- /dev/null +++ b/13_BitonicSort/app_resources/common.hlsl @@ -0,0 +1,13 @@ +#ifndef _BITONIC_SORT_COMMON_INCLUDED_ +#define _BITONIC_SORT_COMMON_INCLUDED_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +/* Push-constant block; this header is included by both the shader and main.cpp. */ +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 10; // 1024 threads (2^10) +NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 2; // 4 elements per thread (2^2) - VIRTUAL THREADING! 
+NBL_CONSTEXPR uint32_t elementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); // 4096 elements (2^12) +#endif diff --git a/13_BitonicSort/config.json.template b/13_BitonicSort/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/13_BitonicSort/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug and RelWithDebInfo because this is a foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/13_BitonicSort/main.cpp b/13_BitonicSort/main.cpp new file mode 100644 index 000000000..c1b1515c2 --- /dev/null +++ b/13_BitonicSort/main.cpp @@ -0,0 +1,319 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + + +#include "nbl/examples/examples.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + + +// Simple showcase of how to run Bitonic Sort on a 1D array using workgroup operations +class BitonicSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + // Compute pipeline that runs the sort shader, and the layout it was created with + smart_refctd_ptr m_pipeline; + smart_refctd_ptr m_layout; + + smart_refctd_ptr m_utils; + + // Non-owning: both streaming buffers are fetched from m_utils in onAppInitialized and stay null until then + StreamingTransientDataBufferMT<>* m_upStreamingBuffer = nullptr; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer = nullptr; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses (0 until the corresponding buffer exists) + uint64_t m_upStreamingBufferAddress = 0; + uint64_t m_downStreamingBufferAddress = 0; + uint64_t m_deviceLocalBufferAddress = 0; + + // Alignment passed to the streaming-buffer allocations; computed from the device limits in onAppInitialized + uint32_t m_alignment = 0; + + // Timeline semaphore signalled by the submit, and the last value we asked it to signal + smart_refctd_ptr m_timeline; + uint64_t semaphoreValue = 0; + +public: + + BitonicSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // Does all the work of the example: loads the shader, uploads random (key, value) pairs, dispatches the sort and queues the readback + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + // Load shader + auto prepShader = [&](const core::string& path) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; + auto assetBundle = m_assetMgr->getAsset(path, lp); + const 
auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + return nullptr; + } + + auto source = IAsset::castDown(assets[0]); + // Runtime check instead of assert: an asset of the wrong type must not reach compileShader in release builds + if (!source) + { + logFail("Loaded asset is not a shader!"); + return nullptr; + } + + auto shader = m_device->compileShader({ source.get() }); + if (!shader) + { + logFail("Creation of Bitonic Sort Shader failed!"); + return nullptr; + } + return shader; + }; + + auto bitonicSortShader = prepShader("app_resources/bitonic_sort_shader.comp.hlsl"); + if (!bitonicSortShader) + return logFail("Failed to compile bitonic sort shader!"); + + m_utils = video::IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger)); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(uint32_t) * elementCount * 2; // *2 because we store (key, value) pairs + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + // The buffer is dereferenced on the very next line, so bail out if creation failed + if (!m_deviceLocalBuffer) + return logFail("Failed to create device-local buffer!\n"); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), 
IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + // Without backing memory the device address queried below is meaningless + if (!gpubufMem.isValid()) + return logFail("Failed to allocate device-local memory for the sort buffer!\n"); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + + // The shader's only input is the buffer address, passed as a push constant + const nbl::asset::SPushConstantRange pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantData) + }; + + { + m_layout = m_device->createPipelineLayout({ &pcRange,1 }); + if (!m_layout) + return logFail("Failed to create pipeline layout!\n"); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_layout.get(); + params.shader.shader = bitonicSortShader.get(); + params.shader.entryPoint = "main"; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.cached.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(uint32_t)); + + m_timeline = m_device->createSemaphore(semaphoreValue); + if (!m_timeline) + return logFail("Failed to create timeline semaphore!\n"); + + IQueue* const queue = getComputeQueue(); + + const uint32_t inputSize = sizeof(uint32_t) * elementCount * 2; // *2 because we store (key, value) pairs + + const uint32_t AllocationCount = 1; + + auto inputOffset = m_upStreamingBuffer->invalid_value; + + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + + // Generate random input data + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + + std::cout << "Input array: "; + for (uint32_t i = 0; i < elementCount; i++) { + uint32_t key = g() % 10000; + uint32_t value = i; // Original index as payload, so every pair stays identifiable after the sort + inputPtr[i * 2] = key; + inputPtr[i * 2 + 1] = value; + 
std::cout << "(" << key << "," << value << "), "; + if ((i + 1) % 20 == 0) { + std::cout << "\n"; + } + } + std::cout << "\nElement count: " << elementCount << "\n"; + + // Always remember to flush! + if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + const uint32_t outputSize = inputSize; + + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); + + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool) + return logFail("Failed to create Command Pool!\n"); + // Allocate the command buffer exactly once: checked, and with the logger attached + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger))) + return logFail("Failed to create Command Buffers!\n"); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + + const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress}; + + // Upload: up-streaming buffer -> device-local buffer + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = inputOffset; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo); + + // Make the uploaded data visible to the compute shader before the dispatch + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo1 = {}; + decltype(pipelineBarrierInfo1)::buffer_barrier_t barrier1 = {}; + pipelineBarrierInfo1.bufBarriers = { &barrier1, 1u }; + barrier1.range.buffer = m_deviceLocalBuffer; + barrier1.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier1.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + 
barrier1.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier1.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS; // the shader sorts in place: it reads and then overwrites what the copy wrote + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo1); + + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + + // A single workgroup sorts the whole array + cmdbuf->dispatch(1, 1, 1); + + // Make the shader's writes visible to the readback copy + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo2 = {}; + decltype(pipelineBarrierInfo2)::buffer_barrier_t barrier2 = {}; + pipelineBarrierInfo2.bufBarriers = { &barrier2, 1u }; + barrier2.range.buffer = m_deviceLocalBuffer; + barrier2.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier2.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier2.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier2.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo2); + + // Readback: device-local buffer -> down-streaming buffer + copyInfo.srcOffset = 0; + copyInfo.dstOffset = outputOffset; + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo); + cmdbuf->end(); + } + + semaphoreValue++; + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphoreValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT // the last recorded command is the readback copy, so the signal must wait for the copy stage, not just compute + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + m_api->startCapture(); + queue->submit({ &submitInfo,1 }); + m_api->endCapture(); + } + + // Both streaming allocations are released only once the GPU has signalled this value + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphoreValue }; + + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); + + auto latchedConsumer = make_smart_refctd_ptr( + 
IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Sorted array: "; + const uint32_t* const data = reinterpret_cast(bufSrc); + for (auto i = 0u; i < elementCount; i++) { + uint32_t key = data[i * 2]; + uint32_t value = data[i * 2 + 1]; + std::cout << "(" << key << "," << value << "), "; + if ((i + 1) % 20 == 0) { + std::cout << "\n"; + } + } + std::cout << "\nElement count: " << elementCount << "\n"; + + bool is_sorted = true; + int32_t error_index = -1; + for (uint32_t i = 1; i < elementCount; i++) { + uint32_t prevKey = data[(i - 1) * 2]; + uint32_t currKey = data[i * 2]; + if (currKey < prevKey) { + is_sorted = false; + error_index = i; + break; + } + } + + if (is_sorted) { + std::cout << "Array is correctly sorted!\n"; + } + else { + std::cout << "Array is NOT sorted correctly!\n"; + std::cout << "Error at index " << error_index << ":\n"; + std::cout << " Previous key [" << (error_index - 1) << "] = " << data[(error_index - 1) * 2] << "\n"; + std::cout << " Current key [" << error_index << "] = " << data[error_index * 2] << "\n"; + std::cout << " (" << data[error_index * 2] << " < " << data[(error_index - 1) * 2] << " is WRONG!)\n"; + } + }, + std::move(cmdbuf), m_downStreamingBuffer + ); + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); + + return true; + } + + bool keepRunning() override { return false; } + + void workLoopBody() override {} + + bool onAppTerminated() override + { + while (m_downStreamingBuffer->cull_frees()) {} + return device_base_t::onAppTerminated(); + } +}; + +NBL_MAIN_FUNC(BitonicSortApp) diff --git a/13_BitonicSort/pipeline.groovy b/13_BitonicSort/pipeline.groovy new file mode 100644 index 000000000..0af4402e6 --- /dev/null +++ b/13_BitonicSort/pipeline.groovy @@ -0,0 +1,50 @@ +import 
org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +/** CI builder for the bitonic sort example: only the build step does work, it runs `cmake --build` for this target. */ +class CBitonicSortBuilder extends IBuilder +{ + public CBitonicSortBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +/** Factory called by the pipeline; must instantiate the builder class declared in this file. */ +def create(Agent _agent, _info) +{ + return new CBitonicSortBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e02eadc1..fc68aef51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,9 +42,10 @@ if(NBL_BUILD_EXAMPLES) # showcase use of FFT for post-FX Bloom effect add_subdirectory(11_FFT) # - add_subdirectory(12_MeshLoaders) - # - #add_subdirectory(13_MaterialCompiler EXCLUDE_FROM_ALL) + add_subdirectory(12_MeshLoaders EXCLUDE_FROM_ALL) + + # bitonic + add_subdirectory(13_BitonicSort) # Waiting for a refactor #add_subdirectory(27_PLYSTLDemo) @@ -96,19 +97,9 @@ if(NBL_BUILD_EXAMPLES) # we link common example api library and force examples to reuse its PCH foreach(T IN LISTS TARGETS) - get_target_property(TYPE ${T} TYPE) - if(NOT ${TYPE} MATCHES INTERFACE) - target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) - target_include_directories(${T} PUBLIC $) - set_target_properties(${T} PROPERTIES 
DISABLE_PRECOMPILE_HEADERS OFF) - target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") - - if(NBL_EMBED_BUILTIN_RESOURCES) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsSource) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsInclude) - LINK_BUILTIN_RESOURCES_TO_TARGET(${T} NblExtExamplesAPIBuiltinsBuild) - endif() - endif() + target_link_libraries(${T} PUBLIC ${NBL_EXAMPLES_API_TARGET}) + target_include_directories(${T} PUBLIC $) + target_precompile_headers(${T} REUSE_FROM "${NBL_EXAMPLES_API_TARGET}") endforeach() NBL_ADJUST_FOLDERS(examples)