Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
099c502
[OpenMP] Add parser/semantic support for dyn_groupprivate clause
kevinsala Aug 8, 2025
fa3c742
[OpenMP][Offload] Add offload runtime support for dyn_groupprivate cl…
kevinsala Aug 8, 2025
f66e5fa
[OpenMP] Add codegen support for dyn_groupprivate clause
kevinsala Aug 8, 2025
f20f4ba
Add fixes
kevinsala Aug 10, 2025
c34e062
[OpenMP][Flang] Add empty clause support for dyn_groupprivate in Flang
kevinsala Aug 17, 2025
96a73ab
Merge branch 'main' into users/kevinsala/omp-dyn-groupprivate-pr
kparzysz Aug 18, 2025
0a7a96d
Fix merge in ClauseT.h
kparzysz Aug 18, 2025
b5b8439
more merge fixes
kparzysz Aug 18, 2025
6e4c547
more merge fixes
kparzysz Aug 18, 2025
776401c
Merge remote-tracking branch 'upstream/main' into users/kevinsala/omp…
kevinsala Aug 22, 2025
84fc963
Add fixes and improvements after merge
kevinsala Aug 25, 2025
00550f9
Merge remote-tracking branch 'upstream/users/kevinsala/omp-dyn-groupp…
kevinsala Aug 25, 2025
c698b84
Merge remote-tracking branch 'upstream/users/kevinsala/omp-dyn-groupp…
kevinsala Aug 25, 2025
86f0cf0
Update syntax for fallback complex modifier
kevinsala Oct 23, 2025
86d4e6d
Merge remote-tracking branch 'upstream/main' into users/kevinsala/omp…
kevinsala Oct 24, 2025
f9eee16
Merge branch 'users/kevinsala/omp-dyn-groupprivate-pr' into users/kev…
kevinsala Oct 24, 2025
3a2fe70
Update for fallback complex modifier
kevinsala Oct 25, 2025
d4bf656
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Oct 26, 2025
5f68ea0
Add support for null fallback
kevinsala Oct 26, 2025
9fb6e27
Fix test
kevinsala Oct 27, 2025
4662a4f
Fix parsing
kevinsala Oct 27, 2025
64e7abc
Fix format
kevinsala Oct 27, 2025
3307ec1
Merge branch 'users/kevinsala/omp-dyn-groupprivate-pr' into users/kev…
kevinsala Oct 27, 2025
968cc69
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Oct 27, 2025
7b53c9a
Fix review comments
kevinsala Oct 27, 2025
79b34f1
Fix more review comments
kevinsala Oct 27, 2025
c4905e0
Fix initialization of a fallback variable
kevinsala Oct 30, 2025
b1930db
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Oct 30, 2025
c439e44
Add codegen test for dyn_groupprivate
kevinsala Oct 30, 2025
407e41f
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Oct 30, 2025
b76e32c
Fix review comments
kevinsala Nov 2, 2025
03db991
Merge branch 'users/kevinsala/omp-dyn-groupprivate-pr' into users/kev…
kevinsala Nov 2, 2025
944b7e7
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Nov 2, 2025
a3cd7ef
Add cgroup mem parameters in createTarget
kevinsala Nov 4, 2025
0fa3d30
Merge remote-tracking branch 'upstream/main' into users/kevinsala/omp…
kevinsala Nov 7, 2025
a59d104
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Nov 7, 2025
42eaac1
Fix format
kevinsala Nov 8, 2025
81f2225
Merge remote-tracking branch 'upstream/main' into users/kevinsala/omp…
kevinsala Nov 9, 2025
0ead633
Merge branch 'users/kevinsala/omp-dyn-groupprivate-codegen-pr' into u…
kevinsala Nov 9, 2025
0062013
Merge remote-tracking branch 'upstream/main' into users/kevinsala/omp…
kevinsala Nov 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions offload/include/Shared/APITypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
uint64_t Unused : 62;
} Flags = {0, 0, 0};
uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
uint64_t Unused : 60;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
Expand Down
17 changes: 16 additions & 1 deletion offload/include/Shared/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr;
};

/// Kinds of fallback that can be selected for the dynamic cgroup memory.
enum class DynCGroupMemFallbackType : unsigned char {
  /// No fallback. Also signals that no fallback was triggered.
  None = 0,
  /// Abort the execution. Shares its numeric value with None.
  Abort = None,
  /// Return a null pointer.
  Null = 1,
  /// Allocate from an implementation-defined memory space.
  DefaultMem = 2
};

/// The kernel launch environment. Every member has an in-class default, so a
/// default-constructed object has null pointers, zero counters/sizes and a
/// fallback kind of DynCGroupMemFallbackType::None.
///
/// NOTE(review): members are declared pointers first, then the 32-bit fields,
/// then the one-byte fallback kind. Confirm with every consumer of this
/// layout before reordering.
struct KernelLaunchEnvironmentTy {
  void *ReductionBuffer = nullptr;
  /// NOTE(review): presumably the memory used when the dynamic cgroup memory
  /// falls back to another memory space -- confirm against the runtime.
  void *DynCGroupMemFbPtr = nullptr;
  uint32_t ReductionCnt = 0;
  uint32_t ReductionIterCnt = 0;
  /// NOTE(review): presumably the dynamic cgroup memory size in bytes --
  /// confirm against the runtime.
  uint32_t DynCGroupMemSize = 0;
  /// Fallback kind for the dynamic cgroup memory.
  DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
};

#endif // OMPTARGET_SHARED_ENVIRONMENT_H
3 changes: 3 additions & 0 deletions offload/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ struct DeviceTy {
/// Record whether there are pending images for this device.
void setHasPendingImages(bool V) {
  HasPendingImages = V;
}

/// Get the maximum shared memory per team for any kernel.
uint64_t getMaxSharedTeamMemory();

private:
/// Deinitialize the device (and plugin).
void deinit();
Expand Down
12 changes: 12 additions & 0 deletions offload/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,10 +267,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif

/// OpenMP access group type: the criterion for grouping tasks according to a
/// specific grouping property.
enum omp_access_t {
  /// Tasks are grouped by the contention group to which they belong.
  omp_access_cgroup = 0,
  /// Tasks are grouped by the parallel region to which they bind.
  omp_access_pteam = 1,
};

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
size_t
omp_get_groupprivate_limit(int device_num,
omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
Expand Down
14 changes: 14 additions & 0 deletions offload/libomptarget/OpenMP/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}

/// Report the group-private memory limit of device \p DeviceNum.
///
/// Returns 0 when \p DeviceNum equals omp_get_initial_device(); otherwise
/// returns the value of getMaxSharedTeamMemory() for the resolved device.
/// \p AccessGroup is accepted but not consulted by this implementation. A
/// device number that cannot be resolved is reported through FATAL_MESSAGE.
EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
                                         omp_access_t AccessGroup) {
  TIMESCOPE();
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));

  // The initial device always reports a limit of zero.
  const bool IsInitialDevice = (DeviceNum == omp_get_initial_device());
  if (IsInitialDevice)
    return 0;

  auto TargetOrErr = PM->getDevice(DeviceNum);
  if (!TargetOrErr) {
    FATAL_MESSAGE(DeviceNum, "%s", toString(TargetOrErr.takeError()).c_str());
  }

  return TargetOrErr->getMaxSharedTeamMemory();
}

EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
Expand Down
6 changes: 6 additions & 0 deletions offload/libomptarget/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,9 @@ bool DeviceTy::useAutoZeroCopy() {
bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size);
}

uint64_t DeviceTy::getMaxSharedTeamMemory() {
using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
return RTL->query_device_info(
RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
}
1 change: 1 addition & 0 deletions offload/libomptarget/exports
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_accessible;
Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;

typedef enum {
Expand Down
38 changes: 26 additions & 12 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ struct AMDGPUMemoryPoolTy {

if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;

return Plugin::success();
}

Expand Down Expand Up @@ -548,6 +547,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}

StaticBlockMemSize = GroupSize;

// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
Expand All @@ -571,8 +572,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {

/// Launch the AMDGPU kernel function.
///
/// \p NumThreads and \p NumBlocks are three-element arrays. \p DynBlockMemSize
/// is the dynamic per-block memory requested for this launch
/// (NOTE(review): presumably in bytes -- confirm).
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
                 uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
                 KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
                 AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Return maximum block size for maximum occupancy
Expand Down Expand Up @@ -2186,6 +2187,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;

// Retrieve the size of the group memory.
for (const auto *Pool : AllMemoryPools) {
if (Pool->isGroup()) {
size_t Size = 0;
if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
return Err;
MaxBlockSharedMemSize = Size;
break;
}
}

// Supports block shared memory natively.
HasNativeBlockSharedMem = true;

return Plugin::success();
}

Expand Down Expand Up @@ -3180,7 +3195,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;

Expand Down Expand Up @@ -3712,6 +3727,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
Expand All @@ -3724,13 +3740,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;

// Account for user requested dynamic shared memory.
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}

uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
Expand Down Expand Up @@ -3782,9 +3791,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}

// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager);
getStaticBlockMemSize() + DynBlockMemSize,
StackSize, ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
34 changes: 30 additions & 4 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,10 @@ struct InfoTreeNode {
}
};

/// Kinds of information that can be requested through a device query.
enum class DeviceQueryKind {
  /// Query for the maximum shared memory per team.
  DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
};

/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
Expand Down Expand Up @@ -361,7 +365,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Plugin-specific kernel launch; pure virtual, so each kernel type provides
/// its own implementation.
///
/// \p NumThreads and \p NumBlocks are three-element arrays. \p DynBlockMemSize
/// is the dynamic per-block memory requested for this launch
/// (NOTE(review): presumably in bytes -- confirm).
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
                         uint32_t NumThreads[3], uint32_t NumBlocks[3],
                         uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
                         KernelLaunchParamsTy LaunchParams,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

Expand All @@ -371,6 +375,9 @@ struct GenericKernelTy {
/// Return the kernel name as a C string backed by the stored Name.
const char *getName() const {
  return Name.c_str();
}

/// Get the size of the static per-block memory consumed by the kernel.
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; }

/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
Expand All @@ -383,9 +390,10 @@ struct GenericKernelTy {
}

/// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *>
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
AsyncInfoWrapperTy &AsyncInfo) const;
Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const;

/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
Expand Down Expand Up @@ -485,6 +493,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;

/// The size of the static memory used per block.
uint32_t StaticBlockMemSize = 0;

/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;

Expand Down Expand Up @@ -794,6 +805,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get the unique identifier of the device.
const char *getDeviceUid() const { return DeviceUid.c_str(); }

/// Return the total per-block shared memory that can be used in any kernel.
uint32_t getMaxBlockSharedMemSize() const {
  return MaxBlockSharedMemSize;
}

/// Report whether the device has native block shared memory.
bool hasNativeBlockSharedMem() const {
  return HasNativeBlockSharedMem;
}

/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
Expand Down Expand Up @@ -1251,6 +1268,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif

/// The total per-block shared memory that a kernel may use.
uint32_t MaxBlockSharedMemSize = 0;

/// Whether the device has native block shared memory.
bool HasNativeBlockSharedMem = false;
};

/// Class implementing common functionalities of offload plugins. Each plugin
Expand Down Expand Up @@ -1484,6 +1507,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);

/// Retrieve information about the given device.
int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);

/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);

Expand Down
Loading
Loading