41 changes: 37 additions & 4 deletions cpp/include/tensorrt_llm/runtime/virtualMemory.h
@@ -22,9 +22,11 @@
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/memoryCounters.h"

#include <atomic>
#include <cuda.h>
#include <map>
#include <mutex>
#include <numeric>
#include <unistd.h>
#include <utility>

@@ -466,7 +468,7 @@ class CudaVirtualMemoryAllocator
CudaVirtualMemoryManager& mManager;
std::string mTag;
CudaStreamPtr mBackStream;
std::size_t mPageSize;
std::atomic<std::size_t> mAlignment;
RestoreMode mMode;
bool mBackground{};

@@ -487,14 +489,45 @@ class CudaVirtualMemoryAllocator
: mManager(manager)
, mTag(std::move(tag))
, mBackStream(std::move(backStream))
, mPageSize(getpagesize())
, mAlignment(0)
, mMode(mode)
{
}

[[nodiscard]] std::size_t pageAligned(std::size_t n) const noexcept
[[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
{
return (n + mPageSize - 1) & ~(mPageSize - 1);
// Lazily load the alignment, since the CUDA driver may not yet be initialized when the
// Configuration is constructed.
// We have one process per GPU, so caching the value is fine.
constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
std::size_t alignment = 0;
if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
{
std::size_t gpuAlignment = 1;
CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
{
CU_MEM_LOCATION_TYPE_DEVICE,
device,
}};
TLLM_CU_CHECK(
cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
alignment = std::lcm(getpagesize(), gpuAlignment);
mAlignment.store(alignment, std::memory_order_relaxed);
}
else
{
// spin wait
while (alignment == loading)
{
#if defined(__x86_64__)
asm volatile("pause");
#elif defined(__aarch64__)
asm volatile("yield");
#endif
alignment = mAlignment.load(std::memory_order_relaxed);
}
}
return (n + alignment - 1) / alignment * alignment;
}

// Background configuration, used to indicate no virtual memory allocator is explicitly configured by the user.
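The new mAlignment member replaces the plain page-size field with a lazily initialized, thread-safe cache. Below is a minimal standalone sketch of that sentinel/publish/spin pattern, with a hypothetical expensiveQuery() standing in for the cuMemGetAllocationGranularity call; it illustrates the idea under those assumptions rather than reproducing the exact TensorRT-LLM code.

```cpp
// Sketch of the sentinel-based lazy initialization used for mAlignment.
// The first caller wins the compare-exchange, computes the value, and
// publishes it; concurrent callers spin until the sentinel is replaced.
#include <atomic>
#include <cstddef>
#include <limits>

// Hypothetical stand-in for the CUDA granularity query.
std::size_t expensiveQuery()
{
    return 2 * 1024 * 1024;
}

std::size_t lazyValue(std::atomic<std::size_t>& slot)
{
    constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
    std::size_t value = 0;
    if (slot.compare_exchange_strong(value, loading, std::memory_order_relaxed))
    {
        // We won the race: compute and publish the real value.
        value = expensiveQuery();
        slot.store(value, std::memory_order_relaxed);
    }
    else
    {
        // Another thread is (or was) initializing: wait until the sentinel is gone.
        // The real code additionally issues pause/yield hints in this loop.
        while (value == loading)
        {
            value = slot.load(std::memory_order_relaxed);
        }
    }
    return value;
}
```

Relaxed ordering suffices here because the published value itself is the only payload carried through the atomic.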
12 changes: 6 additions & 6 deletions cpp/tensorrt_llm/runtime/virtualMemory.cpp
@@ -339,11 +339,11 @@ static void* deviceptr_cast(CUdeviceptr ptr)
void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int device) const
{
CUdeviceptr address{};
std::size_t const pageAlignedSize = mConfig->pageAligned(n);
TLLM_CU_CHECK(cuMemAddressReserve(&address, pageAlignedSize, 0, {}, 0));
std::size_t const alignedSize = mConfig->aligned(n, device);
TLLM_CU_CHECK(cuMemAddressReserve(&address, alignedSize, 0, {}, 0));

CUDAVirtualMemoryChunk::Configurators configurators;
configurators.push_back(std::make_unique<UnicastConfigurator>(address, n,
configurators.push_back(std::make_unique<UnicastConfigurator>(address, alignedSize,
CUmemAccessDesc{{
CU_MEM_LOCATION_TYPE_DEVICE,
device,
@@ -372,7 +372,7 @@ void CudaVirtualMemoryAllocator::allocate(Pointer* ptr, std::size_t n, int devic
CU_MEM_LOCATION_TYPE_DEVICE,
device,
}},
n),
alignedSize),
std::move(configurators));

*ptr = deviceptr_cast(address);
@@ -383,8 +383,8 @@ void CudaVirtualMemoryAllocator::deallocate(Pointer ptr, std::size_t n) const
auto const address = deviceptr_cast(ptr);
mConfig->mManager.remove(address);

std::size_t const pageAlignedSize = mConfig->pageAligned(n);
TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, pageAlignedSize));
std::size_t const alignedSize = mConfig->aligned(n);
TLLM_CU_CHECK_FREE_RESOURCE(cuMemAddressFree(address, alignedSize));
}

} // namespace tensorrt_llm::runtime
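With this change both cuMemAddressReserve and cuMemAddressFree receive the same rounded size, so the reservation and the release always cover an identical range. The arithmetic behind aligned() can be reproduced in isolation; the 4 KiB page size and 2 MiB recommended granularity in the sketch below are assumed example values (the real code queries getpagesize() and cuMemGetAllocationGranularity).

```cpp
// Standalone sketch of the size rounding performed by Configuration::aligned().
#include <cstddef>
#include <cstdio>
#include <numeric>

int main()
{
    std::size_t const pageSize = 4096;                  // assumed host page size
    std::size_t const gpuGranularity = 2 * 1024 * 1024; // assumed recommended granularity
    std::size_t const alignment = std::lcm(pageSize, gpuGranularity); // 2 MiB with these values

    for (std::size_t n : {std::size_t{1}, std::size_t{64} * 1024 * 1024 + 1})
    {
        std::size_t const aligned = (n + alignment - 1) / alignment * alignment;
        std::printf("%zu bytes -> %zu bytes\n", n, aligned); // 1 -> 2 MiB, 64 MiB + 1 -> 66 MiB
    }
    return 0;
}
```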
36 changes: 36 additions & 0 deletions cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -1570,3 +1570,39 @@ TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocator)
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
}
}

TEST_F(VirtualMemoryManagerTest, TestCudaVirtualMemoryAllocatorUnalignedSize)
{
std::size_t constexpr size = 64 * 1024 * 1024 + 1; // 64 MB + 1 byte
std::string tag = "test_allocator_tag";

// Create a CUDA stream for the allocator
CudaStream stream;
auto streamPtr = std::make_shared<CudaStream>(std::move(stream));

// Create configuration for the virtual address allocator
auto config = std::make_shared<CudaVirtualMemoryAllocator::Configuration>(
*mVMManager.get(), tag, CudaVirtualMemoryAllocator::RestoreMode::NONE, streamPtr);

auto memoryBegin = getCurrentProcessMemoryInfo();

// Create a buffer using the virtual address allocator
auto buffer = std::make_unique<VirtualAddressDeviceBuffer>(
size, nvinfer1::DataType::kINT8, CudaVirtualMemoryAllocator{config});

auto memoryAfterAllocation = getCurrentProcessMemoryInfo();
if (memoryInfoAvailable())
{
// The unaligned request is rounded up to the allocation granularity, so strictly more
// memory than requested should be in use after the allocation.
ASSERT_LT(memoryBegin + size, memoryAfterAllocation) << "Buffer allocation should use more memory than the requested size";
}

// Clean up by destroying the buffer (this should automatically clean up the virtual memory)
buffer.reset();

auto memoryAfterCleanup = getCurrentProcessMemoryInfo();
if (memoryInfoAvailable())
{
ASSERT_EQ(memoryBegin, memoryAfterCleanup) << "Buffer destruction should free memory";
}
}