From 30d67dad7c12d29e37f9fcad40200198caa29ab2 Mon Sep 17 00:00:00 2001
From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:59:00 +0800
Subject: [PATCH] [None][fix] Correct virtual memory allocation alignment

Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
---
 .../tensorrt_llm/runtime/virtualMemory.h      | 41 +++++++++++++++++--
 cpp/tensorrt_llm/runtime/virtualMemory.cpp    | 12 +++---
 .../unit_tests/runtime/virtualMemoryTest.cpp  | 36 ++++++++++++++++
 3 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
index c39a60995eb..bc3bf935db7 100644
--- a/cpp/include/tensorrt_llm/runtime/virtualMemory.h
+++ b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
@@ -22,9 +22,11 @@
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/memoryCounters.h"
 
+#include <atomic>
 #include
 #include
 #include
+#include <numeric>
 #include
 #include
 
@@ -466,7 +468,7 @@ class CudaVirtualMemoryAllocator
     CudaVirtualMemoryManager& mManager;
     std::string mTag;
     CudaStreamPtr mBackStream;
-    std::size_t mPageSize;
+    std::atomic<std::size_t> mAlignment;
     RestoreMode mMode;
     bool mBackground{};
 
@@ -487,14 +489,45 @@ class CudaVirtualMemoryAllocator
         : mManager(manager)
         , mTag(std::move(tag))
        , mBackStream(std::move(backStream))
-        , mPageSize(getpagesize())
+        , mAlignment(0)
         , mMode(mode)
     {
     }
 
-    [[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept
+    [[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
     {
-        return (n + mPageSize - 1) & ~(mPageSize - 1);
+        // Lazily determine the alignment: the CUDA driver may not yet be initialized when the
+        // Configuration is constructed.
+        // We have one process per GPU, so caching the value is fine.
+        constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
+        std::size_t alignment = 0;
+        if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
+        {
+            std::size_t gpuAlignment = 1;
+            CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
+                {
+                    CU_MEM_LOCATION_TYPE_DEVICE,
+                    device,
+                }};
+            TLLM_CU_CHECK(
+                cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+            alignment = std::lcm(getpagesize(), gpuAlignment);
+            mAlignment.store(alignment, std::memory_order_relaxed);
+        }
+        else
+        {
+            // Spin-wait until the thread querying the granularity publishes the value.
+            while (alignment == loading)
+            {
+#if defined(__x86_64__)
+                asm volatile("pause");
+#elif defined(__aarch64__)
+                asm volatile("yield");
+#endif
+                alignment = mAlignment.load(std::memory_order_relaxed);
+            }
+        }
+        return (n + alignment - 1) / alignment * alignment;
     }
 
     // Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user.
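Sketch (not part of the patch): the alignment cached by aligned() above is the least common
multiple of the host page size and the device's recommended allocation granularity. A minimal
standalone version of that computation, assuming device 0 and only the CUDA driver API:

// Assumes an available CUDA driver on device 0; error handling is reduced to a small helper.
#include <cuda.h>
#include <unistd.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <numeric>

static void checkCu(CUresult rc, char const* what)
{
    if (rc != CUDA_SUCCESS)
    {
        std::fprintf(stderr, "%s failed with CUresult %d\n", what, static_cast<int>(rc));
        std::exit(1);
    }
}

int main()
{
    checkCu(cuInit(0), "cuInit");

    // Recommended granularity for pinned device allocations on device 0, as queried in aligned().
    CUmemAllocationProp prop{};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;
    std::size_t gpuAlignment = 1;
    checkCu(cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED),
        "cuMemGetAllocationGranularity");

    // The patch aligns to the least common multiple of the host page size and the GPU granularity.
    std::size_t const alignment = std::lcm(static_cast<std::size_t>(getpagesize()), gpuAlignment);

    // Round an arbitrary request up to that alignment, using the same formula as aligned().
    std::size_t const n = 64 * 1024 * 1024 + 1;
    std::size_t const alignedSize = (n + alignment - 1) / alignment * alignment;
    std::printf("alignment = %zu, request %zu -> alignedSize %zu\n", alignment, n, alignedSize);
    return 0;
}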
diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
index 488da30d653..c6e987947c3 100644
--- a/cpp/tensorrt_llm/runtime/virtualMemory.cpp
+++ b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
@@ -339,11 +339,11 @@ static void* deviceptr_cast(CUdeviceptr ptr)
 void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const
 {
     CUdeviceptr address{};
-    std::size_t const pageAlignedSize = mConfig->pageAligned(n);
-    TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0));
+    std::size_t const alignedSize = mConfig->aligned(n, device);
+    TLLM_CU_CHECK(cuMemAddressReserve(&address, alignedSize, 0, {}, 0));
 
     CUDAVirtualMemoryChunk::Configurators configurators;
-    configurators.push_back(std::make_unique(address, n,
+    configurators.push_back(std::make_unique(address, alignedSize,
         CUmemAccessDesc{{
             CU_MEM_LOCATION_TYPE_DEVICE,
             device,
@@ -372,7 +372,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic
             CU_MEM_LOCATION_TYPE_DEVICE,
             device,
         }},
-        n),
+        alignedSize),
         std::move(configurators));
 
     *ptr = deviceptr_cast(address);
@@ -383,8 +383,8 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const
     auto const address = deviceptr_cast(ptr);
     mConfig->mManager.remove(address);
 
-    std::size_t const pageAlignedSize = mConfig->pageAligned(n);
-    TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize));
+    std::size_t const alignedSize = mConfig->aligned(n);
+    TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, alignedSize));
 }
 
 } // namespace tensorrt_llm::runtime
diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
index 970a05299b1..a4a6e55e850 100644
--- a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
+++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -1570,3 +1570,39 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator)
         ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
     }
 }
+
+TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocatorUnalignedSize)
+{
+    std::size_t constexpr size = 64 * 1024 * 1024 + 1; // 64 MiB + 1 byte
+    std::string tag = "test_allocator_tag";
+
+    // Create a CUDA stream for the allocator
+    CudaStream stream;
+    auto streamPtr = std::make_shared<CudaStream>(std::move(stream));
+
+    // Create the configuration for the virtual memory allocator
+    auto config = std::make_shared(
+        *mVMManager.get(), tag, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr);
+
+    auto memoryBegin = getCurrentProcessMemoryInfo();
+
+    // Create a buffer using the virtual memory allocator
+    auto buffer = std::make_unique(
+        size, nvinfer1::DataType::kINT8, CudaVirtualMemoryAllocator{config});
+
+    auto memoryAfterAllocation = getCurrentProcessMemoryInfo();
+    if (memoryInfoAvailable())
+    {
+        // The unaligned size is rounded up, so more memory than requested is allocated
+        ASSERT_LT(memoryBegin + size, memoryAfterAllocation) << "Buffer allocation does not allocate memory";
+    }
+
+    // Clean up by destroying the buffer (this should automatically release the virtual memory)
+    buffer.reset();
+
+    auto memoryAfterCleanup = getCurrentProcessMemoryInfo();
+    if (memoryInfoAvailable())
+    {
+        ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
+    }
+}
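
Sketch (not part of the patch): the CUDA driver expects the sizes passed to cuMemCreate and
cuMemMap to be multiples of the allocation granularity, which is why allocate() and deallocate()
now forward the rounded-up alignedSize rather than the raw n. A minimal standalone
reserve/create/map/unmap sequence using the aligned size; the checkCu helper below is
illustrative, not TensorRT-LLM's TLLM_CU_CHECK:

#include <cuda.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

static void checkCu(CUresult rc, char const* what)
{
    if (rc != CUDA_SUCCESS)
    {
        std::fprintf(stderr, "%s failed with CUresult %d\n", what, static_cast<int>(rc));
        std::exit(1);
    }
}

int main()
{
    checkCu(cuInit(0), "cuInit");
    CUdevice dev;
    checkCu(cuDeviceGet(&dev, 0), "cuDeviceGet");
    CUcontext ctx;
    checkCu(cuDevicePrimaryCtxRetain(&ctx, dev), "cuDevicePrimaryCtxRetain");
    checkCu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent");

    CUmemAllocationProp prop{};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = dev;
    std::size_t granularity = 0;
    checkCu(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED),
        "cuMemGetAllocationGranularity");

    std::size_t const n = 64 * 1024 * 1024 + 1; // unaligned request, as in the new test
    std::size_t const alignedSize = (n + granularity - 1) / granularity * granularity;

    CUdeviceptr va = 0;
    checkCu(cuMemAddressReserve(&va, alignedSize, 0, 0, 0), "cuMemAddressReserve"); // VA range

    CUmemGenericAllocationHandle handle;
    checkCu(cuMemCreate(&handle, alignedSize, &prop, 0), "cuMemCreate"); // physical backing
    checkCu(cuMemMap(va, alignedSize, 0, handle, 0), "cuMemMap");        // map into the range

    CUmemAccessDesc access{};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id = dev;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    checkCu(cuMemSetAccess(va, alignedSize, &access, 1), "cuMemSetAccess");

    std::printf("mapped %zu bytes (requested %zu)\n", alignedSize, n);

    // Teardown mirrors deallocate(): unmap, release the handle, free the reservation.
    checkCu(cuMemUnmap(va, alignedSize), "cuMemUnmap");
    checkCu(cuMemRelease(handle), "cuMemRelease");
    checkCu(cuMemAddressFree(va, alignedSize), "cuMemAddressFree");
    checkCu(cuDevicePrimaryCtxRelease(dev), "cuDevicePrimaryCtxRelease");
    return 0;
}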