Skip to content

Commit a35aac3

Browse files
[None][fix] Correct virtual memory allocation alignment
Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com>
1 parent b10137f commit a35aac3

File tree

4 files changed

+96
-12
lines changed

4 files changed

+96
-12
lines changed

cpp/include/tensorrt_llm/runtime/virtualMemory.h

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
#include "tensorrt_llm/runtime/iBuffer.h"
2323
#include "tensorrt_llm/runtime/memoryCounters.h"
2424

25+
#include <atomic>
2526
#include <cuda.h>
2627
#include <map>
2728
#include <mutex>
29+
#include <numeric>
2830
#include <unistd.h>
2931
#include <utility>
3032

@@ -214,7 +216,7 @@ struct LocalCreator : CUDAVirtualMemoryChunk::Creator
214216
CUmemGenericAllocationHandle create() override
215217
{
216218
CUmemGenericAllocationHandle handle{};
217-
TLLM_CU_CHECK(cuMemCreate(&handle, mSize, &mProp, 0));
219+
TLLM_CU_CHECK_WITH_INFO(cuMemCreate(&handle, mSize, &mProp, 0), "allocating %zu bytes of memory", mSize);
218220
if constexpr (count)
219221
{
220222
MemoryCounters::getInstance().allocate(
@@ -466,7 +468,7 @@ class CudaVirtualMemoryAllocator
466468
CudaVirtualMemoryManager& mManager;
467469
std::string mTag;
468470
CudaStreamPtr mBackStream;
469-
std::size_t mPageSize;
471+
std::atomic<std::size_t> mAlignment;
470472
RestoreMode mMode;
471473
bool mBackground{};
472474

@@ -487,14 +489,45 @@ class CudaVirtualMemoryAllocator
487489
: mManager(manager)
488490
, mTag(std::move(tag))
489491
, mBackStream(std::move(backStream))
490-
, mPageSize(getpagesize())
492+
, mAlignment(0)
491493
, mMode(mode)
492494
{
493495
}
494496

495-
[[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept
497+
[[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
496498
{
497-
return (n + mPageSize - 1) & ~(mPageSize - 1);
499+
// Lazily load the alignment, since the CUDA driver may not yet be initialized when Configuration is
500+
// constructed.
501+
// We have one process for each GPU so caching the value is fine.
502+
constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
503+
std::size_t alignment = 0;
504+
if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
505+
{
506+
std::size_t gpuAlignment = 1;
507+
CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
508+
{
509+
CU_MEM_LOCATION_TYPE_DEVICE,
510+
device,
511+
}};
512+
TLLM_CU_CHECK(
513+
cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
514+
alignment = std::lcm(getpagesize(), gpuAlignment);
515+
mAlignment.store(alignment, std::memory_order_relaxed);
516+
}
517+
else
518+
{
519+
// spin wait
520+
while (alignment == loading)
521+
{
522+
#if defined(__x86_64__)
523+
asm volatile("pause");
524+
#elif defined(__aarch64__)
525+
asm volatile("yield");
526+
#endif
527+
alignment = mAlignment.load(std::memory_order_relaxed);
528+
}
529+
}
530+
return (n + alignment - 1) / alignment * alignment;
498531
}
499532

500533
// Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user.

cpp/tensorrt_llm/common/cudaDriverWrapper.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,22 @@ class CUDADriverWrapper
141141
};
142142

143143
template <typename T>
144-
void checkDriver(
145-
T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file, int const line)
144+
void checkDriver(T result, CUDADriverWrapper const& wrap, char const* const func, char const* const file,
145+
int const line, char const* info = nullptr)
146146
{
147147
if (result)
148148
{
149149
char const* errorName = nullptr;
150150
char const* errorString = nullptr;
151151
wrap.cuGetErrorName(result, &errorName);
152152
wrap.cuGetErrorString(result, &errorString);
153+
if (info != nullptr)
154+
{
155+
throw TllmException(file, line,
156+
fmtstr(
157+
"[TensorRT-LLM][ERROR] CUDA driver error in %s (%s): %s: %s.", func, info, errorName, errorString)
158+
.c_str());
159+
}
153160
throw TllmException(file, line,
154161
fmtstr("[TensorRT-LLM][ERROR] CUDA driver error in %s: %s: %s.", func, errorName, errorString).c_str());
155162
}
@@ -177,6 +184,13 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
177184
(stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, __FILE__, __LINE__); \
178185
} while (0)
179186

187+
#define TLLM_CU_CHECK_WITH_INFO(stat, info, ...) \
188+
do \
189+
{ \
190+
tensorrt_llm::common::checkDriver((stat), *tensorrt_llm::common::CUDADriverWrapper::getInstance(), #stat, \
191+
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__).c_str()); \
192+
} while (0)
193+
180194
// Avoid using CUDADriverWrapper when freeing resources, since the global instance may already have been freed by then.
181195
#define TLLM_CU_CHECK_FREE_RESOURCE(stat) \
182196
do \

cpp/tensorrt_llm/runtime/virtualMemory.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -339,11 +339,12 @@ static void* deviceptr_cast(CUdeviceptr ptr)
339339
void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const
340340
{
341341
CUdeviceptr address{};
342-
std::size_t const pageAlignedSize = mConfig->pageAligned(n);
343-
TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0));
342+
std::size_t const pageAlignedSize = mConfig->aligned(n, device);
343+
TLLM_CU_CHECK_WITH_INFO(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0),
344+
"allocating %zu bytes of address space", pageAlignedSize);
344345

345346
CUDAVirtualMemoryChunk::Configurators configurators;
346-
configurators.push_back(std::make_unique<UnicastConfigurator>(address, n,
347+
configurators.push_back(std::make_unique<UnicastConfigurator>(address, pageAlignedSize,
347348
CUmemAccessDesc{{
348349
CU_MEM_LOCATION_TYPE_DEVICE,
349350
device,
@@ -372,7 +373,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic
372373
CU_MEM_LOCATION_TYPE_DEVICE,
373374
device,
374375
}},
375-
n),
376+
pageAlignedSize),
376377
std::move(configurators));
377378

378379
*ptr = deviceptr_cast(address);
@@ -383,7 +384,7 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const
383384
auto const address = deviceptr_cast(ptr);
384385
mConfig->mManager.remove(address);
385386

386-
std::size_t const pageAlignedSize = mConfig->pageAligned(n);
387+
std::size_t const pageAlignedSize = mConfig->aligned(n);
387388
TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize));
388389
}
389390

cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,3 +1570,39 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator)
15701570
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
15711571
}
15721572
}
1573+
1574+
TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocatorUnalignedSize)
1575+
{
1576+
std::size_t constexpr size = 64 * 1024 * 1024 + 1; // 64 MB + 1 byte
1577+
std::string tag = "test_allocator_tag";
1578+
1579+
// Create a CUDA stream for the allocator
1580+
CudaStream stream;
1581+
auto streamPtr = std::make_shared<CudaStream>(std::move(stream));
1582+
1583+
// Create configuration for the virtual address allocator
1584+
auto config = std::make_shared<CudaVirtualMemoryAllocator::Configuration>(
1585+
*mVMManager.get(), tag, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr);
1586+
1587+
auto memoryBegin = getCurrentProcessMemoryInfo();
1588+
1589+
// Create a buffer using the virtual address allocator
1590+
auto buffer = std::make_unique<VirtualAddressDeviceBuffer>(
1591+
size, nvinfer1::DataType::kINT8, CudaVirtualMemoryAllocator{config});
1592+
1593+
auto memoryAfterAllocation = getCurrentProcessMemoryInfo();
1594+
if (memoryInfoAvailable())
1595+
{
1596+
// Allocates a larger memory block than requested
1597+
ASSERT_LT(memoryBegin + size, memoryAfterAllocation) << "Buffer allocation does not allocate memory";
1598+
}
1599+
1600+
// Clean up by destroying the buffer (this should automatically clean up the virtual memory)
1601+
buffer.reset();
1602+
1603+
auto memoryAfterCleanup = getCurrentProcessMemoryInfo();
1604+
if (memoryInfoAvailable())
1605+
{
1606+
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
1607+
}
1608+
}

0 commit comments

Comments
 (0)