41 changes: 37 additions & 4 deletions cpp/include/tensorrt_llm/runtime/virtualMemory.h
@@ -22,9 +22,11 @@
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/memoryCounters.h"

#include <atomic>
#include <cuda.h>
#include <map>
#include <mutex>
#include <numeric>
#include <unistd.h>
#include <utility>

@@ -466,7 +468,7 @@ class CudaVirtualMemoryAllocator
CudaVirtualMemoryManager& mManager;
std::string mTag;
CudaStreamPtr mBackStream;
std::size_t mPageSize;
std::atomic<std::size_t> mAlignment;
RestoreMode mMode;
bool mBackground{};

@@ -487,14 +489,45 @@ class CudaVirtualMemoryAllocator
: mManager(manager)
, mTag(std::move(tag))
, mBackStream(std::move(backStream))
, mPageSize(getpagesize())
, mAlignment(0)
, mMode(mode)
{
}

[[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept
[[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
{
return (n + mPageSize - 1) & ~(mPageSize - 1);
// Lazily load the alignment, since the CUDA driver may not yet be initialized when the
// Configuration is constructed.
// We have one process per GPU, so caching the value is fine.
constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
std::size_t alignment = 0;
if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
{
std::size_t gpuAlignment = 1;
CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
{
CU_MEM_LOCATION_TYPE_DEVICE,
device,
}};
TLLM_CU_CHECK(
cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
alignment = std::lcm(getpagesize(), gpuAlignment);
mAlignment.store(alignment, std::memory_order_relaxed);
}
else
{
// spin wait
while (alignment == loading)
{
#if defined(__x86_64__)
asm volatile("pause");
#elif defined(__aarch64__)
asm volatile("yield");
#endif
alignment = mAlignment.load(std::memory_order_relaxed);
}
}
return (n + alignment - 1) / alignment * alignment;
}

// Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user.
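The new mAlignment member replaces the plain page-size field with a lazily initialized, thread-safe cache. Below is a minimal standalone sketch of that sentinel/publish/spin pattern, with a hypothetical expensiveQuery() standing in for the cuMemGetAllocationGranularity call; it illustrates the idea under those assumptions rather than reproducing the exact TensorRT-LLM code.

```cpp
// Sketch of the sentinel-based lazy initialization used for mAlignment.
// The first caller wins the compare-exchange, computes the value, and
// publishes it; concurrent callers spin until the sentinel is replaced.
#include <atomic>
#include <cstddef>
#include <limits>

// Hypothetical stand-in for the CUDA granularity query.
std::size_t expensiveQuery()
{
    return 2 * 1024 * 1024;
}

std::size_t lazyValue(std::atomic<std::size_t>& slot)
{
    constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
    std::size_t value = 0;
    if (slot.compare_exchange_strong(value, loading, std::memory_order_relaxed))
    {
        // We won the race: compute and publish the real value.
        value = expensiveQuery();
        slot.store(value, std::memory_order_relaxed);
    }
    else
    {
        // Another thread is (or was) initializing: wait until the sentinel is gone.
        // The real code additionally issues pause/yield hints in this loop.
        while (value == loading)
        {
            value = slot.load(std::memory_order_relaxed);
        }
    }
    return value;
}
```

Relaxed ordering suffices here because the published value itself is the only payload carried through the atomic.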
12 changes: 6 additions & 6 deletions cpp/tensorrt_llm/runtime/virtualMemory.cpp
@@ -339,11 +339,11 @@ static void* deviceptr_cast(CUdeviceptr ptr)
void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const
{
CUdeviceptr address{};
std::size_t const pageAlignedSize = mConfig->pageAligned(n);
TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0));
std::size_t const alignedSize = mConfig->aligned(n, device);
TLLM_CU_CHECK(cuMemAddressReserve(&address, alignedSize, 0, {}, 0));

CUDAVirtualMemoryChunk::Configurators configurators;
configurators.push_back(std::make_unique<UnicastConfigurator>(address, n,
configurators.push_back(std::make_unique<UnicastConfigurator>(address, alignedSize,
CUmemAccessDesc{{
CU_MEM_LOCATION_TYPE_DEVICE,
device,
@@ -372,7 +372,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic
CU_MEM_LOCATION_TYPE_DEVICE,
device,
}},
n),
alignedSize),
std::move(configurators));

*ptr = deviceptr_cast(address);
@@ -383,8 +383,8 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const
auto const address = deviceptr_cast(ptr);
mConfig->mManager.remove(address);

std::size_t const pageAlignedSize = mConfig->pageAligned(n);
TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize));
std::size_t const alignedSize = mConfig->aligned(n);
TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, alignedSize));
}

} // namespace tensorrt_llm::runtime
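With this change both cuMemAddressReserve and cuMemAddressFree receive the same rounded size, so the reservation and the release always cover an identical range. The arithmetic behind aligned() can be reproduced in isolation; the 4 KiB page size and 2 MiB recommended granularity in the sketch below are assumed example values (the real code queries getpagesize() and cuMemGetAllocationGranularity).

```cpp
// Standalone sketch of the size rounding performed by Configuration::aligned().
#include <cstddef>
#include <cstdio>
#include <numeric>

int main()
{
    std::size_t const pageSize = 4096;                  // assumed host page size
    std::size_t const gpuGranularity = 2 * 1024 * 1024; // assumed recommended granularity
    std::size_t const alignment = std::lcm(pageSize, gpuGranularity); // 2 MiB with these values

    for (std::size_t n : {std::size_t{1}, std::size_t{64} * 1024 * 1024 + 1})
    {
        std::size_t const aligned = (n + alignment - 1) / alignment * alignment;
        std::printf("%zu bytes -> %zu bytes\n", n, aligned); // 1 -> 2 MiB, 64 MiB + 1 -> 66 MiB
    }
    return 0;
}
```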
36 changes: 36 additions & 0 deletions cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -1570,3 +1570,39 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator)
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
}
}

TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocatorUnalignedSize)
{
std::size_t constexpr size = 64 * 1024 * 1024 + 1; // 64 MB + 1 byte
std::string tag = "test_allocator_tag";

// Create a CUDA stream for the allocator
CudaStream stream;
auto streamPtr = std::make_shared<CudaStream>(std::move(stream));

// Create configuration for the virtual address allocator
auto config = std::make_shared<CudaVirtualMemoryAllocator::Configuration>(
*mVMManager.get(), tag, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr);

auto memoryBegin = getCurrentProcessMemoryInfo();

// Create a buffer using the virtual address allocator
auto buffer = std::make_unique<VirtualAddressDeviceBuffer>(
size, nvinfer1::DataType::kINT8, CudaVirtualMemoryAllocator{config});

auto memoryAfterAllocation = getCurrentProcessMemoryInfo();
if (memoryInfoAvailable())
{
// The unaligned request is rounded up to the allocation granularity, so strictly more
// memory than requested should be in use after the allocation.
ASSERT_LT(memoryBegin + size, memoryAfterAllocation) << "Buffer allocation should use more memory than the requested size";
}

// Clean up by destroying the buffer (this should automatically clean up the virtual memory)
buffer.reset();

auto memoryAfterCleanup = getCurrentProcessMemoryInfo();
if (memoryInfoAvailable())
{
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
}
}