From 70ef83a06e1c267dcc063d3c68454367efaadde6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 4 Feb 2025 15:50:59 +0100 Subject: [PATCH 01/51] define basic intrinsics --- src/KernelAbstractions.jl | 30 +++++++++++++++++----- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++ src/pocl/backend.jl | 25 ++++++------------- 3 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 src/intrinsics.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 9af2d9d4..8c2844fd 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,10 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +include("intrinsics.jl") +import .KernelIntrinsics +export KernelIntrinsics + ### # Kernel language # - @localmem @@ -460,13 +464,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +function __index_Local_Linear(ctx) + return KernelIntrinsics.get_local_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +function __index_Group_Linear(ctx) + return KernelIntrinsics.get_group_id().x +end + +function __index_Global_Linear(ctx) + return KernelIntrinsics.get_global_id().x +end + +function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] +end +function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] +end +function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 00000000..33ed56fe --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,52 @@ +module KernelIntrinsics + +""" + get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of global work-items specified. + +!!! note + 1-based. +""" +function get_global_size end + +""" + get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique global work-item ID. +""" +function get_global_id end + +""" + get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of local work-items specified. +""" +function get_local_size end + +""" + get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique local work-item ID. +""" +function get_local_id end + +""" + get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the number of groups. +""" +function get_num_groups end + +""" + get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique group ID. +""" +function get_group_id end + +function localmemory end +function barrier end +function print end + +end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87fccdb9..32c2951a 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -140,29 +140,18 @@ end ## Indexing Functions +const KI = KA.KernelIntrinsics -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - return get_global_id(1) -end - -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] -end - -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] -end - -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_id() + return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end @device_override @inline function KA.__validindex(ctx) From 02f3124edc32627324c359779e05dad94a5e261d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:37:32 -0300 Subject: [PATCH 02/51] Fix docstrings --- src/intrinsics.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 33ed56fe..15202fab 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -4,9 +4,6 @@ module KernelIntrinsics get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} Return the number of global work-items specified. - -!!! note - 1-based. """ function get_global_size end @@ -14,6 +11,9 @@ function get_global_size end get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique global work-item ID. + +!!! note + 1-based. """ function get_global_id end @@ -28,6 +28,9 @@ function get_local_size end get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique local work-item ID. + +!!! note + 1-based. """ function get_local_id end @@ -42,6 +45,9 @@ function get_num_groups end get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique group ID. + +!!! note + 1-based. """ function get_group_id end From 1103574e4c44eddd2beab10ed8a15ee7ac8fdec6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:51:00 -0300 Subject: [PATCH 03/51] localmemory --- src/KernelAbstractions.jl | 4 +++- src/pocl/backend.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 8c2844fd..f5f76306 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,7 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end +function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} + KernelIntrinsics.localmemory(t, dims, id) +end function __synchronize() error("@synchronize used outside kernel or not captured") diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 32c2951a..c06d014c 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -166,7 +166,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 8f404522b3809917498784ba56b8980d39423f26 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:30:33 -0300 Subject: [PATCH 04/51] Move default barrier implementation to KernelIntrinsics --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index f5f76306..cd3d1fc9 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,7 +819,7 @@ function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, end function __synchronize() - error("@synchronize used outside kernel or not captured") + KernelIntrinsics.barrier() end @generated function __print(items...) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 15202fab..32cc125d 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -52,7 +52,9 @@ Returns the unique group ID. function get_group_id end function localmemory end -function barrier end +function barrier() + error("Group barrier used outside kernel or not captured") +end function print end end From 471dc5857a1d867e42be79e719f5fb4d02d07117 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:36:22 -0300 Subject: [PATCH 05/51] Implement remaining intrinsics for POCL --- src/pocl/backend.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c06d014c..87345897 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -154,6 +154,18 @@ end return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end +@device_override @inline function KI.get_local_size() + return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) +end + +@device_override @inline function KI.get_num_groups() + return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) +end + +@device_override @inline function KI.get_global_size() + return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) +end + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -178,7 +190,7 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end From a8a14a1595b4501e3296b58e9ade7d05d615499b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:23:11 -0300 Subject: [PATCH 06/51] Add basic tests Modified from initial Claude code --- test/intrinsics.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++++ test/testsuite.jl | 5 ++++ 2 files changed, 67 insertions(+) create mode 100644 test/intrinsics.jl diff --git a/test/intrinsics.jl b/test/intrinsics.jl new file mode 100644 index 00000000..693b2bd9 --- /dev/null +++ b/test/intrinsics.jl @@ -0,0 +1,62 @@ + +@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) + # Test all intrinsics return NamedTuples with x, y, z fields + global_size = KernelIntrinsics.get_global_size() + global_id = KernelIntrinsics.get_global_id() + local_size = KernelIntrinsics.get_local_size() + local_id = KernelIntrinsics.get_local_id() + num_groups = KernelIntrinsics.get_num_groups() + group_id = KernelIntrinsics.get_group_id() + + if UInt32(global_id.x) <= UInt32(global_size.x) + results[1, global_id.x] = global_id.x + results[2, global_id.x] = local_id.x + results[3, global_id.x] = group_id.x + results[4, global_id.x] = global_size.x + results[5, global_id.x] = local_size.x + results[6, global_id.x] = num_groups.x + end +end + + +function intrinsics_testsuite(backend, AT) + @testset "KernelIntrinsics Tests" begin + @testset "Basic intrinsics functionality" begin + + # Test with small kernel + N = 16 + results = AT(zeros(UInt32, 6, N)) + + kernel = test_intrinsics_kernel(backend(), 4, (N,)) + kernel(results, ndrange = N) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for i in 1:N + global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + + # Global IDs should be 1-based and sequential + @test global_id_x == i + + # Global size should match our ndrange + @test global_size_x == N + + # Local size should be 4 (our workgroupsize) + @test local_size_x == 4 + + # Number of groups should be ceil(N/4) = 4 + @test num_groups_x == 4 + + # Group ID should be 1-based + expected_group = div(i - 1, 4) + 1 + @test group_id_x == expected_group + + # Local ID should be 1-based within group + expected_local = ((i - 1) % 4) + 1 + @test local_id_x == expected_local + end + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 2418db99..3426e542 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -26,6 +26,7 @@ end include("test.jl") +include("intrinsics.jl") include("localmem.jl") include("private.jl") include("unroll.jl") @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ specialfunctions_testsuite(backend) end + @conditional_testset "Intrinsics" skip_tests begin + intrinsics_testsuite(backend, AT) + end + @conditional_testset "Localmem" skip_tests begin localmem_testsuite(backend, AT) end From 96d9ffcd8b51278936485e25cb9d3151e0cd58a4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:24:43 -0300 Subject: [PATCH 07/51] Int32 -> Int https://github.com/JuliaGPU/KernelAbstractions.jl/pull/562/files#r1958255435 --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 12 ++++++------ test/intrinsics.jl | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 32cc125d..06011620 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,14 +1,14 @@ module KernelIntrinsics """ - get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. """ function get_global_size end """ - get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique global work-item ID. @@ -18,14 +18,14 @@ Returns the unique global work-item ID. function get_global_id end """ - get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. """ function get_local_size end """ - get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique local work-item ID. @@ -35,14 +35,14 @@ Returns the unique local work-item ID. function get_local_id end """ - get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. """ function get_num_groups end """ - get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique group ID. diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87345897..ef0aba6b 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,27 +143,27 @@ end const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() - return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end @device_override @inline function KI.get_group_id() - return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end @device_override @inline function KI.get_global_id() - return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end @device_override @inline function KI.get_local_size() - return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end @device_override @inline function KI.get_num_groups() - return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end @device_override @inline function KI.get_global_size() - return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 693b2bd9..e58b58a6 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -25,7 +25,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 - results = AT(zeros(UInt32, 6, N)) + results = AT(zeros(Int, 6, N)) kernel = test_intrinsics_kernel(backend(), 4, (N,)) kernel(results, ndrange = N) From 685a7d48165e838e9c863670c9ec17e3a476f9bd Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:58:13 -0300 Subject: [PATCH 08/51] Format --- src/KernelAbstractions.jl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cd3d1fc9..138c63c5 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,13 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} - KernelIntrinsics.localmemory(t, dims, id) -end +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -function __synchronize() - KernelIntrinsics.barrier() -end +__synchronize() = KernelIntrinsics.barrier() @generated function __print(items...) str = "" From a54fb5d674fc2b016a25b77f3d8f647fe1242917 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 00:45:13 -0300 Subject: [PATCH 09/51] Launch interface --- src/KernelAbstractions.jl | 7 ++++--- src/intrinsics.jl | 39 ++++++++++++++++++++++++++++++++++++++- src/pocl/backend.jl | 2 +- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 138c63c5..7b626618 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -244,10 +244,10 @@ Declare storage that is local to a workgroup. """ macro localmem(T, dims) # Stay in sync with CUDAnative - id = gensym("static_shmem") + # id = gensym("static_shmem") return quote - $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id)))) + $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id)))) end end @@ -814,7 +814,8 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims) __synchronize() = KernelIntrinsics.barrier() diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 06011620..77bbf2a3 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -51,10 +51,47 @@ Returns the unique group ID. """ function get_group_id end -function localmemory end +""" + localmemory(T, dims) + +Declare memory that is local to a workgroup. + +!!! note + Backend implementations **must** implement: + ``` + localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + ``` + As well as the on-device functionality. +""" +localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) +# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) + function barrier() error("Group barrier used outside kernel or not captured") end function print end + +""" + KIKernel{Backend, BKern} + +KIKernel closure struct that is used to represent the backend +kernel on the host. + +!!! note + Backend implementations **must** implement: + ``` + KI.KIKernel(::NewBackend, f, args...; kwargs...) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + ``` + As well as the on-device functionality. +""" +struct KIKernel{Backend, BKern} + backend::Backend + kern::BKern +end + +function kernel_max_work_group_size end +function max_work_group_size end +function multiprocessor_count end end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index ef0aba6b..4a50cc7e 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -178,7 +178,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 5a6cd9bf398b1f3b6f2bf9a95d0e06288f26e8ed Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:24:19 -0300 Subject: [PATCH 10/51] Docs --- src/intrinsics.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 77bbf2a3..9a271fa2 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -66,9 +66,28 @@ Declare memory that is local to a workgroup. localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) # @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) +""" + barrier() + +After a `barrier()` call, all read and writes to global and local memory +from each thread in the workgroup are visible in from all other threads in the +workgroup. + +!!! note + `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override barrier() + ``` + As well as the on-device functionality. +""" function barrier() error("Group barrier used outside kernel or not captured") end + +# TODO function print end From faa52139ac062e077e52c54c1144c1963d03bf64 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:27:40 -0300 Subject: [PATCH 11/51] New stuff docs --- src/intrinsics.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 9a271fa2..efbe93f0 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -110,7 +110,51 @@ struct KIKernel{Backend, BKern} kern::BKern end +""" + kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function should always be used to determine the workgroup size before +launching a kernel. + +!!! note + Backend implementations **must** implement: + ``` + kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + ``` + As well as the on-device functionality. +""" function kernel_max_work_group_size end + +""" + max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function represents a theoretical maximum; `kernel_max_work_group_size` +should be used before launching a kernel as some backends may error if +kernel launch with too big a workgroup is attempted. + +!!! note + Backend implementations **must** implement: + ``` + max_work_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" function max_work_group_size end -function multiprocessor_count end + +""" + multiprocessor_count(backend::NewBackend)::Int + +The multiprocessor count for the current device used by `backend`. +Used for certain algorithm optimizations. + +!!! note + Backend implementations **may** implement: + ``` + multiprocessor_count(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +multiprocessor_count(::Backend) = 0 end From 6baa445253f311dba4c7b3350e344bec20f052fd Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:36:26 -0300 Subject: [PATCH 12/51] Fix --- src/intrinsics.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index efbe93f0..5492d8f6 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -105,9 +105,9 @@ kernel on the host. ``` As well as the on-device functionality. """ -struct KIKernel{Backend, BKern} - backend::Backend - kern::BKern +struct KIKernel{B, Kern} + backend::B + kern::Kern end """ @@ -156,5 +156,5 @@ Used for certain algorithm optimizations. ``` As well as the on-device functionality. """ -multiprocessor_count(::Backend) = 0 +multiprocessor_count(_) = 0 end From 1953e8d0a6af642948ebe94559a0cdc9e8698931 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:50:03 -0300 Subject: [PATCH 13/51] Temp adaptation to test in-progress interface with CUDA.jl --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5492d8f6..37cbba88 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -101,7 +101,7 @@ kernel on the host. Backend implementations **must** implement: ``` KI.KIKernel(::NewBackend, f, args...; kwargs...) - (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. """ From ba52bbb5c4ba167b14451921cfd8a55738d12fa4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:21:37 -0400 Subject: [PATCH 14/51] Better launch interface --- src/KernelAbstractions.jl | 10 ++--- src/intrinsics.jl | 88 ++++++++++++++++++++++++++++++++++++++- src/pocl/backend.jl | 33 ++++++++++++++- 3 files changed, 123 insertions(+), 8 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 7b626618..275ae740 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,11 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +""" +Abstract type for all KernelAbstractions backends. +""" +abstract type Backend end + include("intrinsics.jl") import .KernelIntrinsics export KernelIntrinsics @@ -500,11 +505,6 @@ constify(arg) = adapt(ConstAdaptor(), arg) # Backend hierarchy ### -""" - -Abstract type for all KernelAbstractions backends. -""" -abstract type Backend end """ Abstract type for all GPU based KernelAbstractions backends. diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 37cbba88..01ab5ca2 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,5 +1,8 @@ module KernelIntrinsics +import ..KernelAbstractions: Backend +import GPUCompiler: split_kwargs, assign_args! + """ get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} @@ -100,7 +103,6 @@ kernel on the host. !!! note Backend implementations **must** implement: ``` - KI.KIKernel(::NewBackend, f, args...; kwargs...) (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. @@ -157,4 +159,88 @@ Used for certain algorithm optimizations. As well as the on-device functionality. """ multiprocessor_count(_) = 0 + +# TODO: docstring +# kiconvert(::NewBackend, arg) +function kiconvert end + +# TODO: docstring +# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +function kifunction end + +const MACRO_KWARGS = [:launch, :backend] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] + +macro kikernel(backend, ex...) + call = ex[end] + kwargs = map(ex[1:end-1]) do kwarg + if kwarg isa Symbol + :($kwarg = $kwarg) + elseif Meta.isexpr(kwarg, :(=)) + kwarg + else + throw(ArgumentError("Invalid keyword argument '$kwarg'")) + end + end + + # destructure the kernel call + Meta.isexpr(call, :call) || throw(ArgumentError("final argument to @kikern should be a function call")) + f = call.args[1] + args = call.args[2:end] + + code = quote end + vars, var_exprs = assign_args!(code, args) + + # group keyword argument + macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = + split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) + if !isempty(other_kwargs) + key,val = first(other_kwargs).args + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + + # handle keyword arguments that influence the macro's behavior + launch = true + for kwarg in macro_kwargs + key,val = kwarg.args + if key === :launch + isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool")) + launch = val::Bool + else + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + end + if !launch && !isempty(call_kwargs) + error("@kikern with launch=false does not support launch-time keyword arguments; use them when calling the kernel") + end + + # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5) + # we esc() the whole thing now, necessitating gensyms... + @gensym f_var kernel_f kernel_args kernel_tt kernel + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + $f_var = $f + GC.@preserve $(vars...) $f_var begin + $kernel_f = $kiconvert($backend, $f_var) + $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + if $launch + $kernel($(var_exprs...); $(call_kwargs...)) + end + $kernel + end + end) + + return esc(quote + let + $code + end + end) +end + end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 4a50cc7e..b53bed69 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -2,7 +2,7 @@ module POCLKernels using ..POCL using ..POCL: @device_override, cl, method_table -using ..POCL: device +using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA @@ -138,9 +138,38 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end +const KI = KA.KernelIntrinsics + +KI.kiconvert(::POCLBackend, arg) = clconvert(arg) + +function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kern = clfunction(f, tt; name, kwargs...) + KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) +end + +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) + local_size = isnothing(workgroupsize) ? 1 : workgroupsize + global_size = if isnothing(numworkgroups) + 1 + else + numworkgroups*local_size + end + + obj.kern(args...; local_size, global_size) +end + + +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int + 4096 +end +function KI.max_work_group_size(::POCLBackend)::Int + 4096 +end +function KI.multiprocessor_count(::POCLBackend)::Int + 1 +end ## Indexing Functions -const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) From 1fb74a005cb0aec6f6a4ca9b4de180db0180bd65 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 16:35:58 -0400 Subject: [PATCH 15/51] Only keep common compiler kwarg --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 01ab5ca2..0ed4eb87 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -169,7 +169,7 @@ function kiconvert end function kifunction end const MACRO_KWARGS = [:launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] macro kikernel(backend, ex...) From 36c67ce660ec85900a02bad3fc103f3152e8e497 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:24:34 -0400 Subject: [PATCH 16/51] Fixup tests --- test/intrinsics.jl | 26 +++++++++++++++++--------- test/testsuite.jl | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index e58b58a6..537b29fb 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,12 +1,12 @@ -@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) +function test_intrinsics_kernel(results) # Test all intrinsics return NamedTuples with x, y, z fields - global_size = KernelIntrinsics.get_global_size() - global_id = KernelIntrinsics.get_global_id() - local_size = KernelIntrinsics.get_local_size() - local_id = KernelIntrinsics.get_local_id() - num_groups = KernelIntrinsics.get_num_groups() - group_id = KernelIntrinsics.get_group_id() + global_size = KI.get_global_size() + global_id = KI.get_global_id() + local_size = KI.get_local_size() + local_id = KI.get_local_id() + num_groups = KI.get_num_groups() + group_id = KI.get_group_id() if UInt32(global_id.x) <= UInt32(global_size.x) results[1, global_id.x] = global_id.x @@ -16,6 +16,7 @@ results[5, global_id.x] = local_size.x results[6, global_id.x] = num_groups.x end + return end @@ -23,12 +24,19 @@ function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Basic intrinsics functionality" begin + @test KI.max_work_group_size(backend()) isa Int + @test KI.multiprocessor_count(backend()) isa Int + # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = test_intrinsics_kernel(backend(), 4, (N,)) - kernel(results, ndrange = N) + kernel = KI.@kikernel backend() test_intrinsics_kernel(results) + + @test KI.kernel_max_work_group_size(backend(), kernel) isa Int + @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 + + kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) host_results = Array(results) diff --git a/test/testsuite.jl b/test/testsuite.jl index 3426e542..31b801b3 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -1,6 +1,7 @@ module Testsuite using ..KernelAbstractions +import ..KernelAbstractions.KernelIntrinsics as KI using ..Test # We can't add test-dependencies withouth breaking backend packages From 0636279c5b0e8d5756c192aa8a0da741c3144866 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:25:48 -0400 Subject: [PATCH 17/51] No `backend` in macro kwargs --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 0ed4eb87..cf3093a9 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -168,7 +168,7 @@ function kiconvert end # KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} function kifunction end -const MACRO_KWARGS = [:launch, :backend] +const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] From 48c6b8b61458f809318b65617bdc8bf63cca8be2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:25:51 -0400 Subject: [PATCH 18/51] Tests --- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index cf3093a9..dfa80bfb 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -160,18 +160,62 @@ Used for certain algorithm optimizations. """ multiprocessor_count(_) = 0 -# TODO: docstring -# kiconvert(::NewBackend, arg) +""" + kiconvert(::NewBackend, arg) + +This function is called for every argument to be passed to a kernel, allowing it to be +converted to a GPU-friendly format. + +!!! note + Backend implementations **must** implement: + ``` + kiconvert(::NewBackend, arg) + ``` +""" function kiconvert end -# TODO: docstring -# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +""" + KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). + +Currently, only `kifunction` only supports the `name` keyword argument as it is the only one +by all backends. + +Keyword arguments: +- `name`: override the name that the kernel will have in the generated code + +!!! note + Backend implementations **must** implement: + ``` + kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + ``` +""" function kifunction end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] +""" + @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + +High-level interface for executing code on a GPU. + +The `@kikernel` macro should prefix a call, with `func` a callable function or object that +should return nothing. It will be compiled to a function native to the specified `backend` +upon first use, and to a certain extent arguments will be converted and managed automatically +using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is +called and launched according to the specified `backend`. + +There are a few keyword arguments that influence the behavior of `@kikernel`: + +- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned + kernel object should be launched by calling it and passing arguments again. +- `name`: the name of the kernel in the generated code. Defaults to an automatically- + generated name. +""" macro kikernel(backend, ex...) call = ex[end] kwargs = map(ex[1:end-1]) do kwarg From cc240c67f9d4866e304cde7964f6dcfc65c147b1 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:33:47 -0400 Subject: [PATCH 19/51] Fixes --- src/KernelAbstractions.jl | 12 ++++++------ src/pocl/backend.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 275ae740..67d53206 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -469,25 +469,25 @@ end # Internal kernel functions ### -function __index_Local_Linear(ctx) +@inline function __index_Local_Linear(ctx) return KernelIntrinsics.get_local_id().x end -function __index_Group_Linear(ctx) +@inline function __index_Group_Linear(ctx) return KernelIntrinsics.get_group_id().x end -function __index_Global_Linear(ctx) +@inline function __index_Global_Linear(ctx) return KernelIntrinsics.get_global_id().x end -function __index_Local_Cartesian(ctx) +@inline function __index_Local_Cartesian(ctx) return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] end -function __index_Group_Cartesian(ctx) +@inline function __index_Group_Cartesian(ctx) return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] end -function __index_Global_Cartesian(ctx) +@inline function __index_Global_Cartesian(ctx) return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index b53bed69..ac111687 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -147,7 +147,7 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) local_size = isnothing(workgroupsize) ? 1 : workgroupsize global_size = if isnothing(numworkgroups) 1 From f41d1e015cc857c3e5b87b9c7c01d16c50e8dcc2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:34:35 -0400 Subject: [PATCH 20/51] More fix --- src/pocl/backend.jl | 9 +++++---- test/intrinsics.jl | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index ac111687..46fd32df 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -56,7 +56,7 @@ KA.functional(::POCLBackend) = true KA.pagelock!(::POCLBackend, x) = nothing KA.get_backend(::Array) = POCLBackend() -KA.synchronize(::POCLBackend) = nothing +KA.synchronize(::POCLBackend) = cl.finish(cl.queue()) KA.supports_float64(::POCLBackend) = true KA.supports_unified(::POCLBackend) = true @@ -160,13 +160,14 @@ end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int - 4096 + wginfo = cl.work_group_info(kikern.kern.fun, device()) + Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int - 4096 + Int(device().max_work_group_size) end function KI.multiprocessor_count(::POCLBackend)::Int - 1 + Int(device().max_compute_units) end ## Indexing Functions diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 537b29fb..cd75ad3f 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -30,8 +30,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - - kernel = KI.@kikernel backend() test_intrinsics_kernel(results) + kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(backend(), kernel) isa Int @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 From 8e97be1aa76ed6066024b0b4849201466c1bfeac Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:51:40 -0400 Subject: [PATCH 21/51] Print --- src/KernelAbstractions.jl | 18 +----------------- src/intrinsics.jl | 19 ++++++++++++++++++- src/pocl/backend.jl | 2 +- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 67d53206..c53f9989 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,23 +819,7 @@ SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.loc __synchronize() = KernelIntrinsics.barrier() -@generated function __print(items...) - str = "" - args = [] - - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end - - return quote - print($(args...)) - end -end +__print(args...) = KernelIntrinsics._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl index dfa80bfb..665b5903 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -91,7 +91,24 @@ function barrier() end # TODO -function print end +@generated function _print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 46fd32df..43b8d7b4 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -224,7 +224,7 @@ end work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end -@device_override @inline function KA.__print(args...) +@device_override @inline function KI._print(args...) POCL._print(args...) end From bf37826bfc08b5aa53160b4b276c055f1f42cceb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:54:48 -0400 Subject: [PATCH 22/51] tweak --- src/pocl/backend.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 43b8d7b4..60e37bbc 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,6 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA +import KA.KernelIntrinsics as KI import StaticArrays @@ -138,8 +139,6 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end -const KI = KA.KernelIntrinsics - KI.kiconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} From cae65fe412b3ec65b5f2ada87878505c640ff88c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:09:13 -0400 Subject: [PATCH 23/51] Fix --- src/pocl/backend.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 60e37bbc..e711600c 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA -import KA.KernelIntrinsics as KI +import KernekAbstractions.KernelIntrinsics as KI import StaticArrays From 88616587fdb69a427189dc184898c631118db922 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:14:43 -0400 Subject: [PATCH 24/51] Ugh --- src/pocl/backend.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index e711600c..b09be712 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA -import KernekAbstractions.KernelIntrinsics as KI +import KernelAbstractions.KernelIntrinsics as KI import StaticArrays From 86c18fccb39338912c9c4de6b42f0a891cae5a34 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:28:24 -0400 Subject: [PATCH 25/51] Format --- src/KernelAbstractions.jl | 5 +---- src/intrinsics.jl | 26 ++++++++++++++------------ src/pocl/backend.jl | 16 ++++++++-------- test/intrinsics.jl | 5 +++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index c53f9989..feaafa5b 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -248,11 +248,8 @@ end Declare storage that is local to a workgroup. """ macro localmem(T, dims) - # Stay in sync with CUDAnative - # id = gensym("static_shmem") - return quote - $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id)))) + $SharedMemory($(esc(T)), Val($(esc(dims)))) end end diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 665b5903..6db136ac 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -66,8 +66,7 @@ Declare memory that is local to a workgroup. ``` As well as the on-device functionality. """ -localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) -# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) +localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) """ barrier() @@ -110,7 +109,6 @@ end end - """ KIKernel{Backend, BKern} @@ -235,7 +233,7 @@ There are a few keyword arguments that influence the behavior of `@kikernel`: """ macro kikernel(backend, ex...) call = ex[end] - kwargs = map(ex[1:end-1]) do kwarg + kwargs = map(ex[1:(end - 1)]) do kwarg if kwarg isa Symbol :($kwarg = $kwarg) elseif Meta.isexpr(kwarg, :(=)) @@ -257,14 +255,14 @@ macro kikernel(backend, ex...) macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) if !isempty(other_kwargs) - key,val = first(other_kwargs).args + key, val = first(other_kwargs).args throw(ArgumentError("Unsupported keyword argument '$key'")) end # handle keyword arguments that influence the macro's behavior launch = true for kwarg in macro_kwargs - key,val = kwarg.args + key, val = kwarg.args if key === :launch isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool")) launch = val::Bool @@ -282,7 +280,8 @@ macro kikernel(backend, ex...) # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive - push!(code.args, + push!( + code.args, quote $f_var = $f GC.@preserve $(vars...) $f_var begin @@ -295,13 +294,16 @@ macro kikernel(backend, ex...) end $kernel end - end) + end + ) - return esc(quote - let - $code + return esc( + quote + let + $code + end end - end) + ) end end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index b09be712..c38188b3 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,32 +141,32 @@ end KI.kiconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) local_size = isnothing(workgroupsize) ? 1 : workgroupsize global_size = if isnothing(numworkgroups) 1 else - numworkgroups*local_size + numworkgroups * local_size end - obj.kern(args...; local_size, global_size) + return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) - Int(min(wginfo.size, max_work_items)) + return Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int - Int(device().max_work_group_size) + return Int(device().max_work_group_size) end function KI.multiprocessor_count(::POCLBackend)::Int - Int(device().max_compute_units) + return Int(device().max_compute_units) end ## Indexing Functions diff --git a/test/intrinsics.jl b/test/intrinsics.jl index cd75ad3f..b3b95d50 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -30,10 +30,10 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results) + kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(backend(), kernel) isa Int - @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 + @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1 kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) @@ -66,4 +66,5 @@ function intrinsics_testsuite(backend, AT) end end end + return nothing end From 0a6f1720a946a37e7da78d4e46b1f3c032ae16f9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:30:02 -0400 Subject: [PATCH 26/51] Format --- src/pocl/backend.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c38188b3..3aac42a5 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,9 +141,9 @@ end KI.kiconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT} +function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) - KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) + return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) From 2371f884bfe527064c60d5683317dbfa0e585423 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:32:24 -0400 Subject: [PATCH 27/51] drbh --- test/intrinsics.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index b3b95d50..6fbf55c0 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,4 +1,3 @@ - function test_intrinsics_kernel(results) # Test all intrinsics return NamedTuples with x, y, z fields global_size = KI.get_global_size() @@ -19,7 +18,6 @@ function test_intrinsics_kernel(results) return end - function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Basic intrinsics functionality" begin From 0a45913f5ca0d69136bae016d1fb623dcdddbb00 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 20:29:39 -0400 Subject: [PATCH 28/51] `_print` docstring --- src/intrinsics.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 6db136ac..4eddf4a3 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -89,7 +89,20 @@ function barrier() error("Group barrier used outside kernel or not captured") end -# TODO +""" + _print(items...) + + Overloaded by backends to enable `KernelAbstractions.@print` + functionality. + +!!! note + Backend implementations **must** implement: + ``` + _print(items...) + ``` + As well as the on-device functionality, + or define it to return `nothing` +""" @generated function _print(items...) str = "" args = [] From ff3b0777b0abd799fcd8c6105d221d789b03599d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:36:56 -0400 Subject: [PATCH 29/51] Test all launch size options --- test/intrinsics.jl | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 6fbf55c0..79996850 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -20,6 +20,56 @@ end function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin + @testset "Launch parameters" begin + # 1d + function launch_kernel1d(arr) + i, _, _ = KI.get_local_id() + gi, _, _ = KI.get_group_id() + ngi, _, _ = KI.get_num_groups() + + arr[(gi - 1) * ngi + i] = 1f0 + return + end + arr1d = AT(zeros(Float32, 4)) + KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr1d) .== 1) + + # 1d tuple + arr1dt = AT(zeros(Float32, 4)) + KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr1dt) .== 1) + + # 2d + function launch_kernel2d(arr) + i, j, _ = KI.get_local_id() + gi, gj, _ = KI.get_group_id() + ngi, ngj, _ = KI.get_num_groups() + + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0 + return + end + arr2d = AT(zeros(Float32, 4, 4)) + KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr2d) .== 1) + + # 3d + function launch_kernel3d(arr) + i, j, k = KI.get_local_id() + gi, gj, gk = KI.get_group_id() + ngi, ngj, ngk = KI.get_num_groups() + + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0 + return + end + arr3d = AT(zeros(Float32, 4, 4, 4)) + KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr3d) .== 1) + end + @testset "Basic intrinsics functionality" begin @test KI.max_work_group_size(backend()) isa Int From 0732ae094ebbc2724e7f2c87b4279dd693308249 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:39:45 -0400 Subject: [PATCH 30/51] Fix backend --- src/pocl/backend.jl | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 3aac42a5..e66720ed 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -146,17 +146,23 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) - local_size = isnothing(workgroupsize) ? 1 : workgroupsize - global_size = if isnothing(numworkgroups) - 1 - else - numworkgroups * local_size +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + local_size = StaticArrays.MVector{3}((1,1,1)) + if !isnothing(workgroupsize) + for (i, val) in enumerate(workgroupsize) + local_size[i] = val + end end - return obj.kern(args...; local_size, global_size) -end + global_size = StaticArrays.MVector{3}((1,1,1)) + if !isnothing(numworkgroups) + for (i, val) in enumerate(numworkgroups) + global_size[i] = val * local_size[i] + end + end + obj.kern(args...; local_size, global_size) +end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) From 7d6be1bcf9bcf70257ffb97ba7005f9644f449bf Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:54:31 -0400 Subject: [PATCH 31/51] Format --- src/pocl/backend.jl | 8 ++++---- test/intrinsics.jl | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index e66720ed..41bbea58 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -146,22 +146,22 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) - local_size = StaticArrays.MVector{3}((1,1,1)) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) + local_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(workgroupsize) for (i, val) in enumerate(workgroupsize) local_size[i] = val end end - global_size = StaticArrays.MVector{3}((1,1,1)) + global_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(numworkgroups) for (i, val) in enumerate(numworkgroups) global_size[i] = val * local_size[i] end end - obj.kern(args...; local_size, global_size) + return obj.kern(args...; local_size, global_size) end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 79996850..3ec26c56 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -27,7 +27,7 @@ function intrinsics_testsuite(backend, AT) gi, _, _ = KI.get_group_id() ngi, _, _ = KI.get_num_groups() - arr[(gi - 1) * ngi + i] = 1f0 + arr[(gi - 1) * ngi + i] = 1.0f0 return end arr1d = AT(zeros(Float32, 4)) @@ -47,7 +47,7 @@ function intrinsics_testsuite(backend, AT) gi, gj, _ = KI.get_group_id() ngi, ngj, _ = KI.get_num_groups() - arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0 + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1.0f0 return end arr2d = AT(zeros(Float32, 4, 4)) @@ -61,7 +61,7 @@ function intrinsics_testsuite(backend, AT) gi, gj, gk = KI.get_group_id() ngi, ngj, ngk = KI.get_num_groups() - arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0 + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1.0f0 return end arr3d = AT(zeros(Float32, 4, 4, 4)) From 04f21ec918a8a9bb9ce2482bdd4d78d7458d85ab Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 12:18:43 -0400 Subject: [PATCH 32/51] More consistent docstrings --- src/intrinsics.jl | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 4eddf4a3..fb258578 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -7,6 +7,12 @@ import GPUCompiler: split_kwargs, assign_args! get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_global_size end @@ -17,6 +23,12 @@ Returns the unique global work-item ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_global_id end @@ -24,6 +36,12 @@ function get_global_id end get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_local_size end @@ -34,6 +52,12 @@ Returns the unique local work-item ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_local_id end @@ -41,6 +65,12 @@ function get_local_id end get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_num_groups end @@ -51,6 +81,12 @@ Returns the unique group ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_group_id end @@ -62,7 +98,7 @@ Declare memory that is local to a workgroup. !!! note Backend implementations **must** implement: ``` - localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims} ``` As well as the on-device functionality. """ @@ -83,14 +119,13 @@ workgroup. ``` @device_override barrier() ``` - As well as the on-device functionality. """ function barrier() error("Group barrier used outside kernel or not captured") end """ - _print(items...) + _print(args...) Overloaded by backends to enable `KernelAbstractions.@print` functionality. @@ -98,10 +133,10 @@ end !!! note Backend implementations **must** implement: ``` - _print(items...) + @device_override _print(args...) ``` - As well as the on-device functionality, - or define it to return `nothing` + If the backend does not support printing, + define it to return `nothing`. """ @generated function _print(items...) str = "" From d068b80405a483332a7afcf856d35ec5fde7263a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:05:19 -0400 Subject: [PATCH 33/51] Qualify `map` --- src/intrinsics.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index fb258578..a70370b0 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -334,8 +334,8 @@ macro kikernel(backend, ex...) $f_var = $f GC.@preserve $(vars...) $f_var begin $kernel_f = $kiconvert($backend, $f_var) - $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),)) - $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) From 2fe4502e8418058f2f46a7d39f379204e0ddad68 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:44:28 -0400 Subject: [PATCH 34/51] `KIKernel` -> `Kernel` --- src/intrinsics.jl | 10 +++++----- src/pocl/backend.jl | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index a70370b0..5f48d338 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -158,19 +158,19 @@ end """ - KIKernel{Backend, BKern} + Kernel{Backend, BKern} -KIKernel closure struct that is used to represent the backend +Kernel closure struct that is used to represent the backend kernel on the host. !!! note Backend implementations **must** implement: ``` - (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) + (kernel::Kernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. """ -struct KIKernel{B, Kern} +struct Kernel{B, Kern} backend::B kern::Kern end @@ -185,7 +185,7 @@ launching a kernel. !!! note Backend implementations **must** implement: ``` - kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int ``` As well as the on-device functionality. """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 41bbea58..7383a66f 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,10 +143,10 @@ KI.kiconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) - return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) + return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) +function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) local_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(workgroupsize) for (i, val) in enumerate(workgroupsize) @@ -164,7 +164,7 @@ function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workg return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end From aaf20a7942d517aee62d561532675123f7b87ec3 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:46:51 -0400 Subject: [PATCH 35/51] Remove redundant parameter --- src/intrinsics.jl | 4 ++-- src/pocl/backend.jl | 2 +- test/intrinsics.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5f48d338..eda58ce4 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -176,7 +176,7 @@ struct Kernel{B, Kern} end """ - kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int + kernel_max_work_group_size(kern; [max_work_items::Int])::Int The maximum workgroup size limit for a kernel as reported by the backend. This function should always be used to determine the workgroup size before @@ -185,7 +185,7 @@ launching a kernel. !!! note Backend implementations **must** implement: ``` - kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + kernel_max_work_group_size(kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int ``` As well as the on-device functionality. """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 7383a66f..08e5a76e 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -164,7 +164,7 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int +function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 3ec26c56..911e544f 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -80,8 +80,8 @@ function intrinsics_testsuite(backend, AT) results = AT(zeros(Int, 6, N)) kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) - @test KI.kernel_max_work_group_size(backend(), kernel) isa Int - @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1 + @test KI.kernel_max_work_group_size(kernel) isa Int + @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) From e2941fbcb0c21a9cea78b40fb526deba07a8f8a9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:13:27 -0400 Subject: [PATCH 36/51] `kiconvert` -> `argconvert` --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index eda58ce4..ad2c8f6c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -224,7 +224,7 @@ Used for certain algorithm optimizations. multiprocessor_count(_) = 0 """ - kiconvert(::NewBackend, arg) + argconvert(::NewBackend, arg) This function is called for every argument to be passed to a kernel, allowing it to be converted to a GPU-friendly format. @@ -232,10 +232,10 @@ converted to a GPU-friendly format. !!! note Backend implementations **must** implement: ``` - kiconvert(::NewBackend, arg) + argconvert(::NewBackend, arg) ``` """ -function kiconvert end +function argconvert end """ KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} @@ -269,7 +269,7 @@ High-level interface for executing code on a GPU. The `@kikernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically -using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is +using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. There are a few keyword arguments that influence the behavior of `@kikernel`: @@ -333,8 +333,8 @@ macro kikernel(backend, ex...) quote $f_var = $f GC.@preserve $(vars...) $f_var begin - $kernel_f = $kiconvert($backend, $f_var) - $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_f = $argconvert($backend, $f_var) + $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 08e5a76e..f5226bc7 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -139,7 +139,7 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end -KI.kiconvert(::POCLBackend, arg) = clconvert(arg) +KI.argconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) From 5262c9e6fbd6f05332dfbb45037e737693a94e52 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:16:05 -0400 Subject: [PATCH 37/51] Remove old definition --- src/intrinsics.jl | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index ad2c8f6c..172d1de1 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -138,23 +138,8 @@ end If the backend does not support printing, define it to return `nothing`. """ -@generated function _print(items...) - str = "" - args = [] - - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end +function _print end - return quote - print($(args...)) - end -end """ From ca11482c526749b4c47ad3e23068ff8daed8c673 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:28:48 -0400 Subject: [PATCH 38/51] Unbreak ABI --- src/KernelAbstractions.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index feaafa5b..95e71b12 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -248,8 +248,11 @@ end Declare storage that is local to a workgroup. """ macro localmem(T, dims) + # Stay in sync with CUDAnative + id = gensym("static_shmem") + return quote - $SharedMemory($(esc(T)), Val($(esc(dims)))) + $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id)))) end end @@ -811,8 +814,7 @@ include("macros.jl") ### function Scratchpad end -# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims) +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims) __synchronize() = KernelIntrinsics.barrier() From d6f3e66dad2ce09fdcba1b0d2256fb4fd7e8dd9c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:05:50 -0400 Subject: [PATCH 39/51] Readd old definition --- src/KernelAbstractions.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 95e71b12..fb48fd7b 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -369,6 +369,25 @@ macro context() return esc(:(__ctx__)) end +# Defined to keep cpu support for `__print` +@generated function KernelIntrinsics._print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ @print(items...) From 1400c9f061110e5e6732d66d7dfea2cf7914977e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:15:21 -0400 Subject: [PATCH 40/51] Naming --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 172d1de1..5904d3a5 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -143,7 +143,7 @@ function _print end """ - Kernel{Backend, BKern} + Kernel{Backend, Kern} Kernel closure struct that is used to represent the backend kernel on the host. @@ -223,12 +223,12 @@ converted to a GPU-friendly format. function argconvert end """ - KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). -Currently, only `kifunction` only supports the `name` keyword argument as it is the only one +Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one by all backends. Keyword arguments: @@ -237,10 +237,10 @@ Keyword arguments: !!! note Backend implementations **must** implement: ``` - kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} ``` """ -function kifunction end +function gpufunction end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] @@ -321,7 +321,7 @@ macro kikernel(backend, ex...) $kernel_f = $argconvert($backend, $f_var) $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} - $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f5226bc7..913fbe37 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,7 +141,7 @@ end KI.argconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} +function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end @@ -164,8 +164,8 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int - wginfo = cl.work_group_info(kikern.kern.fun, device()) +function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int + wginfo = cl.work_group_info(kernel.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int From 055a81f04ec9d358e78b381a07b293710c5b48d0 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:35:34 -0400 Subject: [PATCH 41/51] Reword --- src/intrinsics.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5904d3a5..5960033c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -211,8 +211,8 @@ multiprocessor_count(_) = 0 """ argconvert(::NewBackend, arg) -This function is called for every argument to be passed to a kernel, allowing it to be -converted to a GPU-friendly format. +This function is called for every argument to be passed to a kernel, +converting them to their device side representation. !!! note Backend implementations **must** implement: From 913f8c8b048e0d72065f9243df5a20784f08972f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:49:35 -0400 Subject: [PATCH 42/51] Docstrings --- src/intrinsics.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5960033c..fb1b00ad 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,3 +1,15 @@ +""" +# KernelIntrinics + +The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side +intrinsics functionality in `KernelAbstractions`. + +Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level +functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality +such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions +like allocating arrays on a backend. +""" module KernelIntrinsics import ..KernelAbstractions: Backend @@ -111,6 +123,9 @@ After a `barrier()` call, all read and writes to global and local memory from each thread in the workgroup are visible in from all other threads in the workgroup. +This does **not** guarantee that a write from a thread in a certain workgroup will +be visible to a thread in a different workgroup. + !!! note `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all. From 298e8078888ece5f2af69deb36ee0692ef0cb4a6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:03:50 -0400 Subject: [PATCH 43/51] Rename `kikernel` --- src/intrinsics.jl | 16 +++++++++++----- test/intrinsics.jl | 10 +++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index fb1b00ad..cd94d473 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -241,7 +241,7 @@ function argconvert end KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning -a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). +a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref). Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one by all backends. @@ -262,24 +262,30 @@ const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] """ - @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) High-level interface for executing code on a GPU. -The `@kikernel` macro should prefix a call, with `func` a callable function or object that +The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. -There are a few keyword arguments that influence the behavior of `@kikernel`: +There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`: - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned kernel object should be launched by calling it and passing arguments again. - `name`: the name of the kernel in the generated code. Defaults to an automatically- generated name. + +!!! note + `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts + a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is + used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel` + is used kernel definition for use with the original higher-level `KernelAbstractions` API. """ -macro kikernel(backend, ex...) +macro kernel(backend, ex...) call = ex[end] kwargs = map(ex[1:(end - 1)]) do kwarg if kwarg isa Symbol diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 911e544f..7a7ec1e0 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -31,13 +31,13 @@ function intrinsics_testsuite(backend, AT) return end arr1d = AT(zeros(Float32, 4)) - KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) + KI.@kernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) KernelAbstractions.synchronize(backend()) @test all(Array(arr1d) .== 1) # 1d tuple arr1dt = AT(zeros(Float32, 4)) - KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) + KI.@kernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) KernelAbstractions.synchronize(backend()) @test all(Array(arr1dt) .== 1) @@ -51,7 +51,7 @@ function intrinsics_testsuite(backend, AT) return end arr2d = AT(zeros(Float32, 4, 4)) - KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) + KI.@kernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) KernelAbstractions.synchronize(backend()) @test all(Array(arr2d) .== 1) @@ -65,7 +65,7 @@ function intrinsics_testsuite(backend, AT) return end arr3d = AT(zeros(Float32, 4, 4, 4)) - KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) + KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) KernelAbstractions.synchronize(backend()) @test all(Array(arr3d) .== 1) end @@ -78,7 +78,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) + kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 From 42f17d6ee9766662732eb0de896d1c0ff5b37ad4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 17:17:29 -0400 Subject: [PATCH 44/51] Renaming --- src/KernelAbstractions.jl | 24 ++++++++++++------------ src/intrinsics.jl | 31 +++++++++++++++++-------------- src/pocl/backend.jl | 2 +- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index fb48fd7b..a0f57c74 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -200,8 +200,8 @@ Abstract type for all KernelAbstractions backends. abstract type Backend end include("intrinsics.jl") -import .KernelIntrinsics -export KernelIntrinsics +import .KernelIntrinsics: KernelIntrinsics, KI +export KernelIntrinsics, KI ### # Kernel language @@ -370,7 +370,7 @@ macro context() end # Defined to keep cpu support for `__print` -@generated function KernelIntrinsics._print(items...) +@generated function KI._print(items...) str = "" args = [] @@ -489,25 +489,25 @@ end ### @inline function __index_Local_Linear(ctx) - return KernelIntrinsics.get_local_id().x + return KI.get_local_id().x end @inline function __index_Group_Linear(ctx) - return KernelIntrinsics.get_group_id().x + return KI.get_group_id().x end @inline function __index_Global_Linear(ctx) - return KernelIntrinsics.get_global_id().x + return KI.get_global_id().x end @inline function __index_Local_Cartesian(ctx) - return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] + return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x] end @inline function __index_Group_Cartesian(ctx) - return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] + return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x] end @inline function __index_Global_Cartesian(ctx) - return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) + return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x) end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @@ -833,11 +833,11 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims) +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) -__synchronize() = KernelIntrinsics.barrier() +__synchronize() = KI.barrier() -__print(args...) = KernelIntrinsics._print(args...) +__print(args...) = KI._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl index cd94d473..e0b2c46a 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,17 +1,20 @@ """ -# KernelIntrinics +# `KernelIntrinics`/`KI` -The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and -host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side +The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KI` intrinsics are used to define the higher-level device-side intrinsics functionality in `KernelAbstractions`. -Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level +Both provide APIs for host and device-side functionality, but `KI` focuses on on lower-level functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions like allocating arrays on a backend. """ module KernelIntrinsics +const KI = KernelIntrinsics +export KI + import ..KernelAbstractions: Backend import GPUCompiler: split_kwargs, assign_args! @@ -238,12 +241,12 @@ converting them to their device side representation. function argconvert end """ - KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning -a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref). +a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref). -Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one +Currently, `kernel_function` only supports the `name` keyword argument as it is the only one by all backends. Keyword arguments: @@ -252,27 +255,27 @@ Keyword arguments: !!! note Backend implementations **must** implement: ``` - gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} ``` """ -function gpufunction end +function kernel_function end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] """ - KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) High-level interface for executing code on a GPU. -The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that +The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. -There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`: +There are a few keyword arguments that influence the behavior of `KI.@kernel`: - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned kernel object should be launched by calling it and passing arguments again. @@ -280,7 +283,7 @@ There are a few keyword arguments that influence the behavior of `KernelIntrinsi generated name. !!! note - `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts + `KI.@kernel` differs from the `KernelAbstractions` macro in that this macro acts a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel` is used kernel definition for use with the original higher-level `KernelAbstractions` API. @@ -342,7 +345,7 @@ macro kernel(backend, ex...) $kernel_f = $argconvert($backend, $f_var) $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} - $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + $kernel = $kernel_function($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 913fbe37..14fe6ae8 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,7 +141,7 @@ end KI.argconvert(::POCLBackend, arg) = clconvert(arg) -function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} +function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end From da98ac896fad002655b2d62d8bafacd57f5778ed Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 11 Nov 2025 20:02:34 -0400 Subject: [PATCH 45/51] Update `unsafe_indices` examples to use KI directly --- examples/histogram.jl | 20 ++++++------ examples/performant_matmul.jl | 61 +++++++++++++++++------------------ 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 9edc0293..39f243c5 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -12,16 +12,15 @@ function create_histogram(input) return histogram_output end -# This a 1D histogram kernel where the histogramming happens on shmem -@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) - gid = @index(Group, Linear) - lid = @index(Local, Linear) +# This a 1D histogram kernel where the histogramming happens on static shmem +function histogram_kernel!(histogram_output, input, ::Val{gs}) where gs + gid = KI.get_group_id().x + lid = KI.get_local_id().x - @uniform gs = prod(@groupsize()) tid = (gid - 1) * gs + lid - @uniform N = length(histogram_output) + N = length(histogram_output) - shared_histogram = @localmem eltype(input) (gs) + shared_histogram = KI.localmemory(eltype(input), gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem @@ -32,7 +31,7 @@ end # Setting shared_histogram to 0 @inbounds shared_histogram[lid] = 0 - @synchronize() + KI.barrier() max_element = min_element + gs if max_element > N @@ -46,7 +45,7 @@ end @atomic shared_histogram[bin] += 1 end - @synchronize() + KI.barrier() if ((lid + min_element - 1) <= N) @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid] @@ -59,8 +58,7 @@ end function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (groupsize,)) - kernel!(histogram_output, input, ndrange = size(input)) + KI.@kernel backend workgroupsize=groupsize numworkgroups=cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize)) return end diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index ac56edb5..f19fe75b 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Metal sometimes supports fewer. const TILE_DIM = 16 -@kernel unsafe_indices = true function coalesced_matmul_kernel!( - output, @Const(input1), @Const(input2), N, R, M, - ::Val{BANK} = Val(1), - ) where {BANK} - gi, gj = @index(Group, NTuple) - i, j = @index(Local, NTuple) - - TILE_DIM = @uniform @groupsize()[1] +function coalesced_matmul_kernel!( + output, input1, input2, N, R, M, + ::Val{TDIM}, ::Val{BANK} = Val(1) + ) where {TDIM, BANK} + gi, gj, _ = KI.get_group_id() + i, j, _ = KI.get_local_id() # +1 to avoid bank conflicts on shared memory - tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) - tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) + tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM)) + tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM)) - # private variable for tile output - outval = @private eltype(output) 1 - @inbounds outval[1] = -zero(eltype(output)) + # variable for tile output + outval = -zero(eltype(output)) - @uniform N = size(output, 1) + N = size(output, 1) # number of tiles depends on inner dimension - @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM) + NUM_TILES = div(R + TDIM - 1, TDIM) # loop over all tiles needed for this calculation for t in 0:(NUM_TILES - 1) # Can't use @index(Global), because we use a smaller ndrange - I = (gi - 1) * TILE_DIM + i - J = (gj - 1) * TILE_DIM + j + I = (gi - 1) * TDIM + i + J = (gj - 1) * TDIM + j # load inputs into tiles, with bounds checking for non-square matrices - if I <= N && t * TILE_DIM + j <= R - @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j] + if I <= N && t * TDIM + j <= R + @inbounds tile1[i, j] = input1[I, t * TDIM + j] else @inbounds tile1[i, j] = 0.0 end if t * TILE_DIM + i <= R && J <= M - @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J] + @inbounds tile2[i, j] = input2[t * TDIM + i, J] else @inbounds tile2[i, j] = 0.0 end # wait for all tiles to be loaded - @synchronize + KI.barrier() # get global values again - I = (gi - 1) * TILE_DIM + i - J = (gj - 1) * TILE_DIM + j + I = (gi - 1) * TDIM + i + J = (gj - 1) * TDIM + j # calculate value of spot in output, use temporary value to allow for vectorization out = zero(eltype(output)) - @simd for k in 1:TILE_DIM + @simd for k in 1:TDIM @inbounds out += tile1[i, k] * tile2[k, j] end - outval[1] += out + outval += out - @synchronize + KI.barrier() end # get global indices again - I = (gi - 1) * TILE_DIM + i - J = (gj - 1) * TILE_DIM + j + I = (gi - 1) * TDIM + i + J = (gj - 1) * TDIM + j # save if inbounds if I <= N && J <= M - @inbounds output[I, J] = outval[1] + @inbounds output[I, J] = outval end + return nothing end N = 1024 @@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R)) B = rand!(allocate(backend, Float32, R, M)) C = KernelAbstractions.zeros(backend, Float32, N, M) -kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM)) +workgroupsize=(TILE_DIM, TILE_DIM) +numworkgroups=(cld(size(C,1), TILE_DIM), cld(size(C,2), TILE_DIM)) -kern(C, A, B, N, R, M, ndrange = size(C)) +KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM)) KernelAbstractions.synchronize(backend) @test isapprox(A * B, C) From 8510dfe46a1888c2362c045176b16f4451376abe Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 11 Nov 2025 20:08:22 -0400 Subject: [PATCH 46/51] Format --- examples/histogram.jl | 4 ++-- examples/performant_matmul.jl | 2 +- src/intrinsics.jl | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 39f243c5..273c6d71 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -13,7 +13,7 @@ function create_histogram(input) end # This a 1D histogram kernel where the histogramming happens on static shmem -function histogram_kernel!(histogram_output, input, ::Val{gs}) where gs +function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs} gid = KI.get_group_id().x lid = KI.get_local_id().x @@ -58,7 +58,7 @@ end function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - KI.@kernel backend workgroupsize=groupsize numworkgroups=cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize)) + KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize)) return end diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index f19fe75b..d9f1d063 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -81,7 +81,7 @@ B = rand!(allocate(backend, Float32, R, M)) C = KernelAbstractions.zeros(backend, Float32, N, M) workgroupsize=(TILE_DIM, TILE_DIM) -numworkgroups=(cld(size(C,1), TILE_DIM), cld(size(C,2), TILE_DIM)) +numworkgroups=(cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM)) KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM)) KernelAbstractions.synchronize(backend) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index e0b2c46a..2025bf5e 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -159,7 +159,6 @@ end function _print end - """ Kernel{Backend, Kern} From 89b57e923b3d86dc285c227fba7e9952f998170b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 11 Nov 2025 20:48:20 -0400 Subject: [PATCH 47/51] Format --- examples/performant_matmul.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index d9f1d063..0eee6e3c 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -80,8 +80,8 @@ A = rand!(allocate(backend, Float32, N, R)) B = rand!(allocate(backend, Float32, R, M)) C = KernelAbstractions.zeros(backend, Float32, N, M) -workgroupsize=(TILE_DIM, TILE_DIM) -numworkgroups=(cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM)) +workgroupsize = (TILE_DIM, TILE_DIM) +numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM)) KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM)) KernelAbstractions.synchronize(backend) From ec246c711dfbd06de72476c610b305a1e327538d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 11 Nov 2025 20:53:30 -0400 Subject: [PATCH 48/51] Format --- examples/histogram.jl | 2 +- examples/performant_matmul.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 273c6d71..29d3bf57 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -52,7 +52,7 @@ function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs} end end - + return end function histogram!(histogram_output, input, groupsize = 256) diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index 0eee6e3c..905b3a05 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -70,7 +70,7 @@ function coalesced_matmul_kernel!( if I <= N && J <= M @inbounds output[I, J] = outval end - return nothing + return end N = 1024 From 5620ba2d6ee6e774fbcc7b9680258075c13fe666 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 12 Nov 2025 17:11:36 -0400 Subject: [PATCH 49/51] Add kernel launch test --- src/intrinsics.jl | 17 +++++++++++++++-- src/pocl/backend.jl | 19 ++++++------------- test/intrinsics.jl | 4 ++++ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 2025bf5e..d7c91fbc 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -168,15 +168,28 @@ kernel on the host. !!! note Backend implementations **must** implement: ``` - (kernel::Kernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) + (kernel::Kernel{<:NewBackend})(args...; numworkgroups=1, workgroupsize=1) ``` - As well as the on-device functionality. + With the `numworkgroups` and `workgroupsize` arguments accepting a scalar Integer + or or a 1, 2, or 3 Integer tuple and throwing an `ArgumentError` otherwise. The + helper function `KI.check_launch_args(numworkgroups, workgroupsize)` can be used + by the backend or a custom check can be implemented. + + Backends must also implement the on-device kernel launch functionality. """ struct Kernel{B, Kern} backend::B kern::Kern end +function check_launch_args(numworkgroups, workgroupsize) + length(numworkgroups) <= 3 || + throw(ArgumentError("`numworkgroups` only accepts up to 3 dimensions")) + length(workgroupsize) <= 3 || + throw(ArgumentError("`workgroupsize` only accepts up to 3 dimensions")) + return +end + """ kernel_max_work_group_size(kern; [max_work_items::Int])::Int diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 14fe6ae8..371891d5 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -146,20 +146,13 @@ function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothin return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) - local_size = StaticArrays.MVector{3}((1, 1, 1)) - if !isnothing(workgroupsize) - for (i, val) in enumerate(workgroupsize) - local_size[i] = val - end - end +function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize = 1) + KI.check_launch_args(numworkgroups, workgroupsize) - global_size = StaticArrays.MVector{3}((1, 1, 1)) - if !isnothing(numworkgroups) - for (i, val) in enumerate(numworkgroups) - global_size[i] = val * local_size[i] - end - end + local_size = (workgroupsize..., ntuple(_->1, 3-length(workgroupsize))...,) + + numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,) + global_size = local_size .* numworkgroups return obj.kern(args...; local_size, global_size) end diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 7a7ec1e0..dff6751e 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -68,6 +68,10 @@ function intrinsics_testsuite(backend, AT) KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) KernelAbstractions.synchronize(backend()) @test all(Array(arr3d) .== 1) + + # 4d (Errors) + @test_throws ArgumentError (KI.@kernel backend() numworkgroups = (2, 2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)) + @test_throws ArgumentError (KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2, 2) launch_kernel3d(arr3d)) end @testset "Basic intrinsics functionality" begin From b0013f112172044a8d9ff137db9074956abf4086 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 12 Nov 2025 22:32:04 -0400 Subject: [PATCH 50/51] Synchronize --- src/pocl/backend.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 371891d5..055e1c7d 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -154,7 +154,10 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,) global_size = local_size .* numworkgroups - return obj.kern(args...; local_size, global_size) + event = obj.kern(args...; local_size, global_size) + wait(event) + cl.clReleaseEvent(event) + return nothing end function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int From da1d96dc1c931c652a49b7426d7a99c8324fa534 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 16 Nov 2025 19:08:31 +0100 Subject: [PATCH 51/51] don't export KI and fix POCL.synchronize --- examples/histogram.jl | 2 ++ examples/performant_matmul.jl | 3 +++ src/KernelAbstractions.jl | 4 ++-- src/intrinsics.jl | 7 ++----- src/pocl/backend.jl | 13 ++++++++++--- test/intrinsics.jl | 2 ++ 6 files changed, 21 insertions(+), 10 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 29d3bf57..f2704fd0 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -1,6 +1,8 @@ # INCLUDE ROCM using KernelAbstractions, Test using KernelAbstractions: @atomic, @atomicswap, @atomicreplace +import KernelAbstractions.KernelIntrinsics as KI + include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend # Function to use as a baseline for CPU metrics diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index 905b3a05..0a279d32 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -1,7 +1,10 @@ using KernelAbstractions +import KernelAbstractions.KernelIntrinsics as KI + using StaticArrays using Test using Random + include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend # We use a TILE_DIM of 16 as a safe value since while diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index a0f57c74..cdb4dd96 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -200,8 +200,8 @@ Abstract type for all KernelAbstractions backends. abstract type Backend end include("intrinsics.jl") -import .KernelIntrinsics: KernelIntrinsics, KI -export KernelIntrinsics, KI +import .KernelIntrinsics as KI +export KernelIntrinsics ### # Kernel language diff --git a/src/intrinsics.jl b/src/intrinsics.jl index d7c91fbc..80ccf6f0 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,7 +1,7 @@ """ -# `KernelIntrinics`/`KI` +# `KernelIntrinsics` -The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and +The `KernelIntrinsics` (or `KI`) module defines the API interface for backends to define various lower-level device and host-side functionality. The `KI` intrinsics are used to define the higher-level device-side intrinsics functionality in `KernelAbstractions`. @@ -12,9 +12,6 @@ like allocating arrays on a backend. """ module KernelIntrinsics -const KI = KernelIntrinsics -export KI - import ..KernelAbstractions: Backend import GPUCompiler: split_kwargs, assign_args! diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 055e1c7d..f23e20b0 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -57,7 +57,14 @@ KA.functional(::POCLBackend) = true KA.pagelock!(::POCLBackend, x) = nothing KA.get_backend(::Array) = POCLBackend() -KA.synchronize(::POCLBackend) = cl.finish(cl.queue()) + +## Implementation note: +## The POCL backend uses `Base.Array` as it's array type, so the external operations +## `broadcast`, `*` and other high-level operations are handled by Julia. In order +## to provide the same memory synchronization semantics as other backends, we +## must synchronize upon kernel launch and can't rely on synchronization upon +## array access. Therefore, `synchronize` is a no-op. +KA.synchronize(::POCLBackend) = nothing KA.supports_float64(::POCLBackend) = true KA.supports_unified(::POCLBackend) = true @@ -149,9 +156,9 @@ end function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize = 1) KI.check_launch_args(numworkgroups, workgroupsize) - local_size = (workgroupsize..., ntuple(_->1, 3-length(workgroupsize))...,) + local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...) - numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,) + numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...) global_size = local_size .* numworkgroups event = obj.kern(args...; local_size, global_size) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index dff6751e..97548c47 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,3 +1,5 @@ +import KernelAbstractions.KernelIntrinsics as KI + function test_intrinsics_kernel(results) # Test all intrinsics return NamedTuples with x, y, z fields global_size = KI.get_global_size()