[None][fix] Correct virtual memory allocation alignment

tongyuantongyu · tongyuantongyu · commit a35aac3df0b8 · 2025-11-26T17:55:47.000+08:00
Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
@@ -22,9 +22,11 @@
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/memoryCounters.h"
 
+#include <atomic>
 #include <cuda.h>
 #include <map>
 #include <mutex>
+#include <numeric>
 #include <unistd.h>
 #include <utility>
 
@@ -214,7 +216,7 @@ struct LocalCreator : CUDAVirtualMemoryChunk::Creator
     CUmemGenericAllocationHandle create() override
     {
         CUmemGenericAllocationHandle handle{};
-        TLLM_CU_CHECK(cuMemCreate(&handle, mSize, &mProp, 0));
+        TLLM_CU_CHECK_WITH_INFO(cuMemCreate(&handle, mSize, &mProp, 0), "allocating %zu bytes of memory", mSize);
         if constexpr (count)
         {
             MemoryCounters::getInstance().allocate(
@@ -466,7 +468,7 @@ class CudaVirtualMemoryAllocator
         CudaVirtualMemoryManager& mManager;
         std::string mTag;
         CudaStreamPtr mBackStream;
-        std::size_t mPageSize;
+        std::atomic<std::size_t> mAlignment;
         RestoreMode mMode;
         bool mBackground{};
 
@@ -487,14 +489,45 @@ class CudaVirtualMemoryAllocator
             : mManager(manager)
             , mTag(std::move(tag))
             , mBackStream(std::move(backStream))
-            , mPageSize(getpagesize())
+            , mAlignment(0)
             , mMode(mode)
         {
         }
 
-        [[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept
+        // Rounds n up to the allocation alignment, i.e. the least common multiple of the host page size and the
+        // recommended CUDA allocation granularity of `device`.
+        //
+        // The alignment is looked up lazily, since the CUDA driver may not yet be initialized when the
+        // Configuration is constructed, and is cached in mAlignment afterwards. `device` is therefore only
+        // consulted by a call that finds the cache empty. We have one process for each GPU, so caching a single
+        // value is fine.
+        //
+        // A failed granularity query is reported through TLLM_CU_CHECK. The cache is left empty in that case, so
+        // the next call retries the query instead of waiting for a value that will never arrive.
+        // mAlignment is atomic, so concurrent calls are fine; racing first calls merely repeat the query.
+        [[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
         {
-            return (n + mPageSize - 1) & ~(mPageSize - 1);
+            // Relaxed ordering is enough: the cached value is the only data published through mAlignment.
+            std::size_t alignment = mAlignment.load(std::memory_order_relaxed);
+            if (alignment == 0)
+            {
+                // Cache is empty. Threads racing here each run the same query and store the same result (they
+                // all target this process' GPU), so no "loading" sentinel or spin-wait is needed, and an
+                // exception below cannot leave other callers spinning on a sentinel forever.
+                std::size_t gpuAlignment = 1;
+                CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
+                    {
+                        CU_MEM_LOCATION_TYPE_DEVICE,
+                        device,
+                    }};
+                TLLM_CU_CHECK(
+                    cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+                alignment = std::lcm(static_cast<std::size_t>(getpagesize()), gpuAlignment);
+                mAlignment.store(alignment, std::memory_order_relaxed);
+            }
+
+            // Round up by division, which does not require the alignment to be a power of two.
+            return (n + alignment - 1) / alignment * alignment;
         }
 
         // Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user.
diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h
@@ -141,15 +141,22 @@ class CUDADriverWrapper
 };
 
 template <typename T>
-void checkDriver(
-    T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line)
+void checkDriver(T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file,
+    int const line, char const* info = nullptr)
 {
     if (result)
     {
         char const* errorName = nullptr;
         char const* errorString = nullptr;
         wrap.cuGetErrorName(result, &errorName);
         wrap.cuGetErrorString(result, &errorString);
+        if (info != nullptr)
+        {
+            throw TllmException(file, line,
+                fmtstr(
+                    "[TensorRT-LLM][ERROR] CUDA driver error in %s (%s): %s: %s.", func, info, errorName, errorString)
+                    .c_str());
+        }
         throw TllmException(file, line,
             fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString).c_str());
     }
@@ -177,6 +184,17 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
             (stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__);               \
     } while (0)
 
+// Checks the result of a CUDA driver call through checkDriver, passing along a printf-style context message
+// (`info` plus its arguments) that checkDriver puts into the error text next to the stringified call.
+// `stat` is evaluated exactly once, and the message is only formatted when `stat` reports an error, so a
+// successful call does not pay for building a string.
+#define TLLM_CU_CHECK_WITH_INFO(stat, info, ...)                                                                       \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        auto const tllmCuCheckStatus_ = (stat);                                                                        \
+        tensorrt_llm::common::checkDriver(tllmCuCheckStatus_,                                                          \
+            *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__,                        \
+            tllmCuCheckStatus_ ? tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__).c_str() : nullptr);                 \
+    } while (0)
+
 // Avoid using CUDADriverWrapper when freeing resource, during which the global instance may already be freed.
 #define TLLM_CU_CHECK_FREE_RESOURCE(stat)                                                                              \
     do                                                                                                                 \
diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
@@ -339,11 +339,12 @@ static void* deviceptr_cast(CUdeviceptr ptr)
 void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const
 {
     CUdeviceptr address{};
-    std::size_t const pageAlignedSize = mConfig->pageAligned(n);
-    TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0));
+    std::size_t const pageAlignedSize = mConfig->aligned(n, device);
+    TLLM_CU_CHECK_WITH_INFO(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0),
+        "allocating %zu bytes of address space", pageAlignedSize);
 
     CUDAVirtualMemoryChunk::Configurators configurators;
-    configurators.push_back(std::make_unique<UnicastConfigurator>(address, n,
+    configurators.push_back(std::make_unique<UnicastConfigurator>(address, pageAlignedSize,
         CUmemAccessDesc{{
                             CU_MEM_LOCATION_TYPE_DEVICE,
                             device,
@@ -372,7 +373,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic
                                                  CU_MEM_LOCATION_TYPE_DEVICE,
                                                  device,
                                              }},
-            n),
+            pageAlignedSize),
         std::move(configurators));
 
     *ptr = deviceptr_cast(address);
@@ -383,7 +384,7 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const
     auto const address = deviceptr_cast(ptr);
     mConfig->mManager.remove(address);
 
-    std::size_t const pageAlignedSize = mConfig->pageAligned(n);
+    std::size_t const pageAlignedSize = mConfig->aligned(n);
     TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize));
 }
 
diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -1570,3 +1570,39 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator)
         ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
     }
 }
+
+TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocatorUnalignedSize)
+{
+    std::size_t constexpr requestedBytes = 64 * 1024 * 1024 + 1; // 64 MB + 1 byte, deliberately not aligned
+    std::string allocatorTag = "test_allocator_tag";
+
+    // The allocator configuration receives its stream as a shared pointer
+    CudaStream backStream;
+    auto backStreamPtr = std::make_shared<CudaStream>(std::move(backStream));
+
+    // Configuration handed to the allocator under test; RestoreMode::NONE is requested
+    auto allocatorConfig = std::make_shared<CudaVirtualMemoryAllocator::Configuration>(
+        *mVMManager.get(), allocatorTag, CudaVirtualMemoryAllocator::RestoreMode::NONE, backStreamPtr);
+
+    auto usageBefore = getCurrentProcessMemoryInfo();
+
+    // Allocate the unaligned request through the virtual address allocator
+    auto deviceBuffer = std::make_unique<VirtualAddressDeviceBuffer>(
+        requestedBytes, nvinfer1::DataType::kINT8, CudaVirtualMemoryAllocator{allocatorConfig});
+
+    auto usageAllocated = getCurrentProcessMemoryInfo();
+    if (memoryInfoAvailable())
+    {
+        // Strictly more than the requested size must be in use, since the allocation is larger than requested
+        ASSERT_LT(usageBefore + requestedBytes, usageAllocated) << "Buffer allocation does not allocate memory";
+    }
+
+    // Dropping the buffer is expected to release the virtual memory behind it as well
+    deviceBuffer.reset();
+
+    auto usageAfterRelease = getCurrentProcessMemoryInfo();
+    if (memoryInfoAvailable())
+    {
+        ASSERT_EQ(usageBefore, usageAfterRelease) << "Buffer destruction should free memory";
+    }
+}