From 70ef83a06e1c267dcc063d3c68454367efaadde6 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 4 Feb 2025 15:50:59 +0100
Subject: [PATCH 01/51] define basic intrinsics

---
 src/KernelAbstractions.jl | 30 +++++++++++++++++-----
 src/intrinsics.jl         | 52 +++++++++++++++++++++++++++++++++++++++
 src/pocl/backend.jl       | 25 ++++++-------------
 3 files changed, 83 insertions(+), 24 deletions(-)
 create mode 100644 src/intrinsics.jl

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 9af2d9d4..8c2844fd 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -194,6 +194,10 @@ function unsafe_free! end
 
 unsafe_free!(::AbstractArray) = return
 
+include("intrinsics.jl")
+import .KernelIntrinsics
+export KernelIntrinsics
+
 ###
 # Kernel language
 # - @localmem
@@ -460,13 +464,27 @@ end
 # Internal kernel functions
 ###
 
-function __index_Local_Linear end
-function __index_Group_Linear end
-function __index_Global_Linear end
+function __index_Local_Linear(ctx)
+    return KernelIntrinsics.get_local_id().x
+end
 
-function __index_Local_Cartesian end
-function __index_Group_Cartesian end
-function __index_Global_Cartesian end
+function __index_Group_Linear(ctx)
+    return KernelIntrinsics.get_group_id().x
+end
+
+function __index_Global_Linear(ctx)
+    return KernelIntrinsics.get_global_id().x
+end
+
+function __index_Local_Cartesian(ctx)
+    return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
+end
+function __index_Group_Cartesian(ctx)
+    return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
+end
+function __index_Global_Cartesian(ctx)
+    return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
+end
 
 @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
 @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
new file mode 100644
index 00000000..33ed56fe
--- /dev/null
+++ b/src/intrinsics.jl
@@ -0,0 +1,52 @@
+module KernelIntrinsics
+
+"""
+    get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Return the number of global work-items specified.
+
+!!! note
+    1-based.
+"""
+function get_global_size end
+
+"""
+    get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Returns the unique global work-item ID.
+"""
+function get_global_id end
+
+"""
+    get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Return the number of local work-items specified.
+"""
+function get_local_size end
+
+"""
+    get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Returns the unique local work-item ID.
+"""
+function get_local_id end
+
+"""
+    get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Returns the number of groups.
+"""
+function get_num_groups end
+
+"""
+    get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+
+Returns the unique group ID.
+"""
+function get_group_id end
+
+function localmemory end
+function barrier end
+function print end
+
+end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 87fccdb9..32c2951a 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -140,29 +140,18 @@ end
 
 
 ## Indexing Functions
+const KI = KA.KernelIntrinsics
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return get_local_id(1)
+@device_override @inline function KI.get_local_id()
+    return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return get_group_id(1)
+@device_override @inline function KI.get_group_id()
+    return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    return get_global_id(1)
-end
-
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
-end
-
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
-end
-
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
+@device_override @inline function KI.get_global_id()
+    return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3))
 end
 
 @device_override @inline function KA.__validindex(ctx)

From 02f3124edc32627324c359779e05dad94a5e261d Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Sep 2025 12:37:32 -0300
Subject: [PATCH 02/51] Fix docstrings

---
 src/intrinsics.jl | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 33ed56fe..15202fab 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -4,9 +4,6 @@ module KernelIntrinsics
     get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32}
 
 Return the number of global work-items specified.
-
-!!! note
-    1-based.
 """
 function get_global_size end
 
@@ -14,6 +11,9 @@ function get_global_size end
     get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
 
 Returns the unique global work-item ID.
+
+!!! note
+    1-based.
 """
 function get_global_id end
 
@@ -28,6 +28,9 @@ function get_local_size end
     get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
 
 Returns the unique local work-item ID.
+
+!!! note
+    1-based.
 """
 function get_local_id end
 
@@ -42,6 +45,9 @@ function get_num_groups end
     get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
 
 Returns the unique group ID.
+
+!!! note
+    1-based.
 """
 function get_group_id end
 

From 1103574e4c44eddd2beab10ed8a15ee7ac8fdec6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 16 Sep 2025 13:51:00 -0300
Subject: [PATCH 03/51] localmemory

---
 src/KernelAbstractions.jl | 4 +++-
 src/pocl/backend.jl       | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 8c2844fd..f5f76306 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -814,7 +814,9 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-function SharedMemory end
+function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id}
+    KernelIntrinsics.localmemory(t, dims, id)
+end
 
 function __synchronize()
     error("@synchronize used outside kernel or not captured")
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 32c2951a..c06d014c 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -166,7 +166,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
     ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end

From 8f404522b3809917498784ba56b8980d39423f26 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 18 Sep 2025 15:30:33 -0300
Subject: [PATCH 04/51] Move default barrier implementation to KernelIntrinsics

---
 src/KernelAbstractions.jl | 2 +-
 src/intrinsics.jl         | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index f5f76306..cd3d1fc9 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -819,7 +819,7 @@ function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims,
 end
 
 function __synchronize()
-    error("@synchronize used outside kernel or not captured")
+    KernelIntrinsics.barrier()
 end
 
 @generated function __print(items...)
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 15202fab..32cc125d 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -52,7 +52,9 @@ Returns the unique group ID.
 function get_group_id end
 
 function localmemory end
-function barrier end
+function barrier()
+    error("Group barrier used outside kernel or not captured")
+end
 function print end
 
 end

From 471dc5857a1d867e42be79e719f5fb4d02d07117 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 18 Sep 2025 15:36:22 -0300
Subject: [PATCH 05/51] Implement remaining intrinsics for POCL

---
 src/pocl/backend.jl | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index c06d014c..87345897 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -154,6 +154,18 @@ end
     return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3))
 end
 
+@device_override @inline function KI.get_local_size()
+    return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3))
+end
+
+@device_override @inline function KI.get_num_groups()
+    return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3))
+end
+
+@device_override @inline function KI.get_global_size()
+    return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3))
+end
+
 @device_override @inline function KA.__validindex(ctx)
     if KA.__dynamic_checkbounds(ctx)
         I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -178,7 +190,7 @@ end
 
 ## Synchronization and Printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 

From a8a14a1595b4501e3296b58e9ade7d05d615499b Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 18 Sep 2025 17:23:11 -0300
Subject: [PATCH 06/51] Add basic tests

Modified from initial Claude code
---
 test/intrinsics.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 test/testsuite.jl  |  5 ++++
 2 files changed, 67 insertions(+)
 create mode 100644 test/intrinsics.jl

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
new file mode 100644
index 00000000..693b2bd9
--- /dev/null
+++ b/test/intrinsics.jl
@@ -0,0 +1,62 @@
+
+@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results)
+    # Test all intrinsics return NamedTuples with x, y, z fields
+    global_size = KernelIntrinsics.get_global_size()
+    global_id = KernelIntrinsics.get_global_id()
+    local_size = KernelIntrinsics.get_local_size()
+    local_id = KernelIntrinsics.get_local_id()
+    num_groups = KernelIntrinsics.get_num_groups()
+    group_id = KernelIntrinsics.get_group_id()
+
+    if UInt32(global_id.x) <= UInt32(global_size.x)
+        results[1, global_id.x] = global_id.x
+        results[2, global_id.x] = local_id.x
+        results[3, global_id.x] = group_id.x
+        results[4, global_id.x] = global_size.x
+        results[5, global_id.x] = local_size.x
+        results[6, global_id.x] = num_groups.x
+    end
+end
+
+
+function intrinsics_testsuite(backend, AT)
+    @testset "KernelIntrinsics Tests" begin
+        @testset "Basic intrinsics functionality" begin
+
+            # Test with small kernel
+            N = 16
+            results = AT(zeros(UInt32, 6, N))
+
+            kernel = test_intrinsics_kernel(backend(), 4, (N,))
+            kernel(results, ndrange = N)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for i in 1:N
+                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+
+                # Global IDs should be 1-based and sequential
+                @test global_id_x == i
+
+                # Global size should match our ndrange
+                @test global_size_x == N
+
+                # Local size should be 4 (our workgroupsize)
+                @test local_size_x == 4
+
+                # Number of groups should be ceil(N/4) = 4
+                @test num_groups_x == 4
+
+                # Group ID should be 1-based
+                expected_group = div(i - 1, 4) + 1
+                @test group_id_x == expected_group
+
+                # Local ID should be 1-based within group
+                expected_local = ((i - 1) % 4) + 1
+                @test local_id_x == expected_local
+            end
+        end
+    end
+end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 2418db99..3426e542 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -26,6 +26,7 @@ end
 
 
 include("test.jl")
+include("intrinsics.jl")
 include("localmem.jl")
 include("private.jl")
 include("unroll.jl")
@@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
         specialfunctions_testsuite(backend)
     end
 
+    @conditional_testset "Intrinsics" skip_tests begin
+        intrinsics_testsuite(backend, AT)
+    end
+
     @conditional_testset "Localmem" skip_tests begin
         localmem_testsuite(backend, AT)
     end

From 96d9ffcd8b51278936485e25cb9d3151e0cd58a4 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 1 Oct 2025 13:24:43 -0300
Subject: [PATCH 07/51] Int32 -> Int

https://github.com/JuliaGPU/KernelAbstractions.jl/pull/562/files#r1958255435
---
 src/intrinsics.jl   | 12 ++++++------
 src/pocl/backend.jl | 12 ++++++------
 test/intrinsics.jl  |  2 +-
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 32cc125d..06011620 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -1,14 +1,14 @@
 module KernelIntrinsics
 
 """
-    get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Return the number of global work-items specified.
 """
 function get_global_size end
 
 """
-    get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Returns the unique global work-item ID.
 
@@ -18,14 +18,14 @@ Returns the unique global work-item ID.
 function get_global_id end
 
 """
-    get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Return the number of local work-items specified.
 """
 function get_local_size end
 
 """
-    get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Returns the unique local work-item ID.
 
@@ -35,14 +35,14 @@ Returns the unique local work-item ID.
 function get_local_id end
 
 """
-    get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Returns the number of groups.
 """
 function get_num_groups end
 
 """
-    get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32}
+    get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Returns the unique group ID.
 
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 87345897..ef0aba6b 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -143,27 +143,27 @@ end
 const KI = KA.KernelIntrinsics
 
 @device_override @inline function KI.get_local_id()
-    return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3))
+    return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
 end
 
 @device_override @inline function KI.get_group_id()
-    return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3))
+    return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
 end
 
 @device_override @inline function KI.get_global_id()
-    return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3))
+    return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
 end
 
 @device_override @inline function KI.get_local_size()
-    return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3))
+    return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
 end
 
 @device_override @inline function KI.get_num_groups()
-    return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3))
+    return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
 end
 
 @device_override @inline function KI.get_global_size()
-    return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3))
+    return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
 @device_override @inline function KA.__validindex(ctx)
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 693b2bd9..e58b58a6 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -25,7 +25,7 @@ function intrinsics_testsuite(backend, AT)
 
             # Test with small kernel
             N = 16
-            results = AT(zeros(UInt32, 6, N))
+            results = AT(zeros(Int, 6, N))
 
             kernel = test_intrinsics_kernel(backend(), 4, (N,))
             kernel(results, ndrange = N)

From 685a7d48165e838e9c863670c9ec17e3a476f9bd Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 2 Oct 2025 16:58:13 -0300
Subject: [PATCH 08/51] Format

---
 src/KernelAbstractions.jl | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index cd3d1fc9..138c63c5 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -814,13 +814,9 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id}
-    KernelIntrinsics.localmemory(t, dims, id)
-end
+SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
 
-function __synchronize()
-    KernelIntrinsics.barrier()
-end
+__synchronize() = KernelIntrinsics.barrier()
 
 @generated function __print(items...)
     str = ""

From a54fb5d674fc2b016a25b77f3d8f647fe1242917 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 22 Oct 2025 00:45:13 -0300
Subject: [PATCH 09/51] Launch interface

---
 src/KernelAbstractions.jl |  7 ++++---
 src/intrinsics.jl         | 39 ++++++++++++++++++++++++++++++++++++++-
 src/pocl/backend.jl       |  2 +-
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 138c63c5..7b626618 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -244,10 +244,10 @@ Declare storage that is local to a workgroup.
 """
 macro localmem(T, dims)
     # Stay in sync with CUDAnative
-    id = gensym("static_shmem")
+    # id = gensym("static_shmem")
 
     return quote
-        $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
+        $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id))))
     end
 end
 
@@ -814,7 +814,8 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
+# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
+SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims)
 
 __synchronize() = KernelIntrinsics.barrier()
 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 06011620..77bbf2a3 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -51,10 +51,47 @@ Returns the unique group ID.
 """
 function get_group_id end
 
-function localmemory end
+"""
+    localmemory(T, dims)
+
+Declare memory that is local to a workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
+    ```
+    As well as the on-device functionality.
+"""
+localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims))
+# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem")))
+
 function barrier()
     error("Group barrier used outside kernel or not captured")
 end
 function print end
 
+
+"""
+    KIKernel{Backend, BKern}
+
+KIKernel closure struct that is used to represent the backend
+kernel on the host.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    KI.KIKernel(::NewBackend, f, args...; kwargs...)
+    (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
+    ```
+    As well as the on-device functionality.
+"""
+struct KIKernel{Backend, BKern}
+    backend::Backend
+    kern::BKern
+end
+
+function kernel_max_work_group_size end
+function max_work_group_size end
+function multiprocessor_count end
 end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index ef0aba6b..4a50cc7e 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -178,7 +178,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end

From 5a6cd9bf398b1f3b6f2bf9a95d0e06288f26e8ed Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:24:19 -0300
Subject: [PATCH 10/51] Docs

---
 src/intrinsics.jl | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 77bbf2a3..9a271fa2 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -66,9 +66,28 @@ Declare memory that is local to a workgroup.
 localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims))
 # @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem")))
 
+"""
+    barrier()
+
+After a `barrier()` call, all read and writes to global and local memory
+from each thread in the workgroup are visible in from all other threads in the
+workgroup.
+
+!!! note
+    `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override barrier()
+    ```
+    As well as the on-device functionality.
+"""
 function barrier()
     error("Group barrier used outside kernel or not captured")
 end
+
+# TODO
 function print end
 
 

From faa52139ac062e077e52c54c1144c1963d03bf64 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:27:40 -0300
Subject: [PATCH 11/51] New stuff docs

---
 src/intrinsics.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 9a271fa2..efbe93f0 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -110,7 +110,51 @@ struct KIKernel{Backend, BKern}
     kern::BKern
 end
 
+"""
+    kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int
+
+The maximum workgroup size limit for a kernel as reported by the backend.
+This function should always be used to determine the workgroup size before
+launching a kernel.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
+    ```
+    As well as the on-device functionality.
+"""
 function kernel_max_work_group_size end
+
+"""
+    max_work_group_size(backend, kern; [max_work_items::Int])::Int
+
+The maximum workgroup size limit for a kernel as reported by the backend.
+This function represents a theoretical maximum; `kernel_max_work_group_size`
+should be used before launching a kernel as some backends may error if
+kernel launch with too big a workgroup is attempted.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    max_work_group_size(backend::NewBackend)::Int
+    ```
+    As well as the on-device functionality.
+"""
 function max_work_group_size end
-function multiprocessor_count end
+
+"""
+    multiprocessor_count(backend::NewBackend)::Int
+
+The multiprocessor count for the current device used by `backend`.
+Used for certain algorithm optimizations.
+
+!!! note
+    Backend implementations **may** implement:
+    ```
+    multiprocessor_count(backend::NewBackend)::Int
+    ```
+    As well as the on-device functionality.
+"""
+multiprocessor_count(::Backend) = 0
 end

From 6baa445253f311dba4c7b3350e344bec20f052fd Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:36:26 -0300
Subject: [PATCH 12/51] Fix

---
 src/intrinsics.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index efbe93f0..5492d8f6 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -105,9 +105,9 @@ kernel on the host.
     ```
     As well as the on-device functionality.
 """
-struct KIKernel{Backend, BKern}
-    backend::Backend
-    kern::BKern
+struct KIKernel{B, Kern}
+    backend::B
+    kern::Kern
 end
 
 """
@@ -156,5 +156,5 @@ Used for certain algorithm optimizations.
     ```
     As well as the on-device functionality.
 """
-multiprocessor_count(::Backend) = 0
+multiprocessor_count(_) = 0
 end

From 1953e8d0a6af642948ebe94559a0cdc9e8698931 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 22 Oct 2025 11:50:03 -0300
Subject: [PATCH 13/51] Temp adaptation to test in-progress interface with
 CUDA.jl

---
 src/intrinsics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 5492d8f6..37cbba88 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -101,7 +101,7 @@ kernel on the host.
     Backend implementations **must** implement:
     ```
     KI.KIKernel(::NewBackend, f, args...; kwargs...)
-    (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
+    (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
     ```
     As well as the on-device functionality.
 """

From ba52bbb5c4ba167b14451921cfd8a55738d12fa4 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:21:37 -0400
Subject: [PATCH 14/51] Better launch interface

---
 src/KernelAbstractions.jl | 10 ++---
 src/intrinsics.jl         | 88 ++++++++++++++++++++++++++++++++++++++-
 src/pocl/backend.jl       | 33 ++++++++++++++-
 3 files changed, 123 insertions(+), 8 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 7b626618..275ae740 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -194,6 +194,11 @@ function unsafe_free! end
 
 unsafe_free!(::AbstractArray) = return
 
+"""
+Abstract type for all KernelAbstractions backends.
+"""
+abstract type Backend end
+
 include("intrinsics.jl")
 import .KernelIntrinsics
 export KernelIntrinsics
@@ -500,11 +505,6 @@ constify(arg) = adapt(ConstAdaptor(), arg)
 # Backend hierarchy
 ###
 
-"""
-
-Abstract type for all KernelAbstractions backends.
-"""
-abstract type Backend end
 
 """
 Abstract type for all GPU based KernelAbstractions backends.
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 37cbba88..01ab5ca2 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -1,5 +1,8 @@
 module KernelIntrinsics
 
+import ..KernelAbstractions: Backend
+import GPUCompiler: split_kwargs, assign_args!
+
 """
     get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
 
@@ -100,7 +103,6 @@ kernel on the host.
 !!! note
     Backend implementations **must** implement:
     ```
-    KI.KIKernel(::NewBackend, f, args...; kwargs...)
     (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
     ```
     As well as the on-device functionality.
@@ -157,4 +159,88 @@ Used for certain algorithm optimizations.
     As well as the on-device functionality.
 """
 multiprocessor_count(_) = 0
+
+# TODO: docstring
+# kiconvert(::NewBackend, arg)
+function kiconvert end
+
+# TODO: docstring
+# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function kifunction end
+
+const MACRO_KWARGS = [:launch, :backend]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline]
+const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
+
+macro kikernel(backend, ex...)
+    call = ex[end]
+    kwargs = map(ex[1:end-1]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
+
+    # destructure the kernel call
+    Meta.isexpr(call, :call) || throw(ArgumentError("final argument to @kikern should be a function call"))
+    f = call.args[1]
+    args = call.args[2:end]
+
+    code = quote end
+    vars, var_exprs = assign_args!(code, args)
+
+    # group keyword argument
+    macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key,val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument '$key'"))
+    end
+
+    # handle keyword arguments that influence the macro's behavior
+    launch = true
+    for kwarg in macro_kwargs
+        key,val = kwarg.args
+        if key === :launch
+            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool"))
+            launch = val::Bool
+        else
+            throw(ArgumentError("Unsupported keyword argument '$key'"))
+        end
+    end
+    if !launch && !isempty(call_kwargs)
+        error("@kikern with launch=false does not support launch-time keyword arguments; use them when calling the kernel")
+    end
+
+    # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
+    #        we esc() the whole thing now, necessitating gensyms...
+    @gensym f_var kernel_f kernel_args kernel_tt kernel
+
+    # convert the arguments, call the compiler and launch the kernel
+    # while keeping the original arguments alive
+    push!(code.args,
+        quote
+            $f_var = $f
+            GC.@preserve $(vars...) $f_var begin
+                $kernel_f = $kiconvert($backend, $f_var)
+                $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),))
+                $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
+                $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
+                if $launch
+                    $kernel($(var_exprs...); $(call_kwargs...))
+                end
+                $kernel
+            end
+         end)
+
+    return esc(quote
+        let
+            $code
+        end
+    end)
+end
+
 end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 4a50cc7e..b53bed69 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -2,7 +2,7 @@ module POCLKernels
 
 using ..POCL
 using ..POCL: @device_override, cl, method_table
-using ..POCL: device
+using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
 
@@ -138,9 +138,38 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize
     return nothing
 end
 
+const KI = KA.KernelIntrinsics
+
+KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
+
+function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    kern = clfunction(f, tt; name, kwargs...)
+    KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
+end
+
+function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+    local_size = isnothing(workgroupsize) ? 1 : workgroupsize
+    global_size = if isnothing(numworkgroups)
+        1
+    else
+        numworkgroups*local_size
+    end
+
+    obj.kern(args...; local_size, global_size)
+end
+
+
+function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int
+    4096
+end
+function KI.max_work_group_size(::POCLBackend)::Int
+    4096
+end
+function KI.multiprocessor_count(::POCLBackend)::Int
+    1
+end
 
 ## Indexing Functions
-const KI = KA.KernelIntrinsics
 
 @device_override @inline function KI.get_local_id()
     return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))

From 1fb74a005cb0aec6f6a4ca9b4de180db0180bd65 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 4 Nov 2025 16:35:58 -0400
Subject: [PATCH 15/51] Only keep common compiler kwarg

---
 src/intrinsics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 01ab5ca2..0ed4eb87 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -169,7 +169,7 @@ function kiconvert end
 function kifunction end
 
 const MACRO_KWARGS = [:launch, :backend]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline]
+const COMPILER_KWARGS = [:name]
 const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
 
 macro kikernel(backend, ex...)

From 36c67ce660ec85900a02bad3fc103f3152e8e497 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 12:24:34 -0400
Subject: [PATCH 16/51] Fixup tests

---
 test/intrinsics.jl | 26 +++++++++++++++++---------
 test/testsuite.jl  |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index e58b58a6..537b29fb 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -1,12 +1,12 @@
 
-@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results)
+function test_intrinsics_kernel(results)
     # Test all intrinsics return NamedTuples with x, y, z fields
-    global_size = KernelIntrinsics.get_global_size()
-    global_id = KernelIntrinsics.get_global_id()
-    local_size = KernelIntrinsics.get_local_size()
-    local_id = KernelIntrinsics.get_local_id()
-    num_groups = KernelIntrinsics.get_num_groups()
-    group_id = KernelIntrinsics.get_group_id()
+    global_size = KI.get_global_size()
+    global_id = KI.get_global_id()
+    local_size = KI.get_local_size()
+    local_id = KI.get_local_id()
+    num_groups = KI.get_num_groups()
+    group_id = KI.get_group_id()
 
     if UInt32(global_id.x) <= UInt32(global_size.x)
         results[1, global_id.x] = global_id.x
@@ -16,6 +16,7 @@
         results[5, global_id.x] = local_size.x
         results[6, global_id.x] = num_groups.x
     end
+    return
 end
 
 
@@ -23,12 +24,19 @@ function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
         @testset "Basic intrinsics functionality" begin
 
+            @test KI.max_work_group_size(backend()) isa Int
+            @test KI.multiprocessor_count(backend()) isa Int
+
             # Test with small kernel
             N = 16
             results = AT(zeros(Int, 6, N))
 
-            kernel = test_intrinsics_kernel(backend(), 4, (N,))
-            kernel(results, ndrange = N)
+            kernel = KI.@kikernel backend() test_intrinsics_kernel(results)
+
+            @test KI.kernel_max_work_group_size(backend(), kernel) isa Int
+            @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1
+
+            kernel(results, workgroupsize = 4, numworkgroups = 4)
             KernelAbstractions.synchronize(backend())
 
             host_results = Array(results)
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 3426e542..31b801b3 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -1,6 +1,7 @@
 module Testsuite
 
 using ..KernelAbstractions
+import ..KernelAbstractions.KernelIntrinsics as KI
 using ..Test
 
 # We can't add test-dependencies withouth breaking backend packages

From 0636279c5b0e8d5756c192aa8a0da741c3144866 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 12:25:48 -0400
Subject: [PATCH 17/51] No `backend` in macro kwargs

---
 src/intrinsics.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 0ed4eb87..cf3093a9 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -168,7 +168,7 @@ function kiconvert end
 # KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
 function kifunction end
 
-const MACRO_KWARGS = [:launch, :backend]
+const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:name]
 const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
 

From 48c6b8b61458f809318b65617bdc8bf63cca8be2 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 12:25:51 -0400
Subject: [PATCH 18/51] Tests

---
 src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index cf3093a9..dfa80bfb 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -160,18 +160,62 @@ Used for certain algorithm optimizations.
 """
 multiprocessor_count(_) = 0
 
-# TODO: docstring
-# kiconvert(::NewBackend, arg)
+"""
+    kiconvert(::NewBackend, arg)
+
+This function is called for every argument to be passed to a kernel, allowing it to be
+converted to a GPU-friendly format.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    kiconvert(::NewBackend, arg)
+    ```
+"""
 function kiconvert end
 
-# TODO: docstring
-# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+"""
+    KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+
+Low-level interface to compile a function invocation for the currently-active GPU, returning
+a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref).
+
+Currently, only `kifunction` only supports the `name` keyword argument as it is the only one
+by all backends.
+
+Keyword arguments:
+- `name`: override the name that the kernel will have in the generated code
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    ```
+"""
 function kifunction end
 
 const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:name]
 const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
 
+"""
+    @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
+
+High-level interface for executing code on a GPU.
+
+The `@kikernel` macro should prefix a call, with `func` a callable function or object that
+should return nothing. It will be compiled to a function native to the specified `backend`
+upon first use, and to a certain extent arguments will be converted and managed automatically
+using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is
+called and launched according to the specified `backend`.
+
+There are a few keyword arguments that influence the behavior of `@kikernel`:
+
+- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned
+  kernel object should be launched by calling it and passing arguments again.
+- `name`: the name of the kernel in the generated code. Defaults to an automatically-
+  generated name.
+"""
 macro kikernel(backend, ex...)
     call = ex[end]
     kwargs = map(ex[1:end-1]) do kwarg

From cc240c67f9d4866e304cde7964f6dcfc65c147b1 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 17:33:47 -0400
Subject: [PATCH 19/51] Fixes

---
 src/KernelAbstractions.jl | 12 ++++++------
 src/pocl/backend.jl       |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 275ae740..67d53206 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -469,25 +469,25 @@ end
 # Internal kernel functions
 ###
 
-function __index_Local_Linear(ctx)
+@inline function __index_Local_Linear(ctx)
     return KernelIntrinsics.get_local_id().x
 end
 
-function __index_Group_Linear(ctx)
+@inline function __index_Group_Linear(ctx)
     return KernelIntrinsics.get_group_id().x
 end
 
-function __index_Global_Linear(ctx)
+@inline function __index_Global_Linear(ctx)
     return KernelIntrinsics.get_global_id().x
 end
 
-function __index_Local_Cartesian(ctx)
+@inline function __index_Local_Cartesian(ctx)
     return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
 end
-function __index_Group_Cartesian(ctx)
+@inline function __index_Group_Cartesian(ctx)
     return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
 end
-function __index_Global_Cartesian(ctx)
+@inline function __index_Global_Cartesian(ctx)
     return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
 end
 
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index b53bed69..ac111687 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -147,7 +147,7 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs
     KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
     local_size = isnothing(workgroupsize) ? 1 : workgroupsize
     global_size = if isnothing(numworkgroups)
         1

From f41d1e015cc857c3e5b87b9c7c01d16c50e8dcc2 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:34:35 -0400
Subject: [PATCH 20/51] More fix

---
 src/pocl/backend.jl | 9 +++++----
 test/intrinsics.jl  | 3 +--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index ac111687..46fd32df 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -56,7 +56,7 @@ KA.functional(::POCLBackend) = true
 KA.pagelock!(::POCLBackend, x) = nothing
 
 KA.get_backend(::Array) = POCLBackend()
-KA.synchronize(::POCLBackend) = nothing
+KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
 KA.supports_float64(::POCLBackend) = true
 KA.supports_unified(::POCLBackend) = true
 
@@ -160,13 +160,14 @@ end
 
 
 function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int
-    4096
+    wginfo = cl.work_group_info(kikern.kern.fun, device())
+    Int(min(wginfo.size, max_work_items))
 end
 function KI.max_work_group_size(::POCLBackend)::Int
-    4096
+    Int(device().max_work_group_size)
 end
 function KI.multiprocessor_count(::POCLBackend)::Int
-    1
+    Int(device().max_compute_units)
 end
 
 ## Indexing Functions
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 537b29fb..cd75ad3f 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -30,8 +30,7 @@ function intrinsics_testsuite(backend, AT)
             # Test with small kernel
             N = 16
             results = AT(zeros(Int, 6, N))
-
-            kernel = KI.@kikernel backend() test_intrinsics_kernel(results)
+            kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(backend(), kernel) isa Int
             @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1

From 8e97be1aa76ed6066024b0b4849201466c1bfeac Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:51:40 -0400
Subject: [PATCH 21/51] Print

---
 src/KernelAbstractions.jl | 18 +-----------------
 src/intrinsics.jl         | 19 ++++++++++++++++++-
 src/pocl/backend.jl       |  2 +-
 3 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 67d53206..c53f9989 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -819,23 +819,7 @@ SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.loc
 
 __synchronize() = KernelIntrinsics.barrier()
 
-@generated function __print(items...)
-    str = ""
-    args = []
-
-    for i in 1:length(items)
-        item = :(items[$i])
-        T = items[i]
-        if T <: Val
-            item = QuoteNode(T.parameters[1])
-        end
-        push!(args, item)
-    end
-
-    return quote
-        print($(args...))
-    end
-end
+__print(args...) = KernelIntrinsics._print(args...)
 
 # Utils
 __size(args::Tuple) = Tuple{args...}
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index dfa80bfb..665b5903 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -91,7 +91,24 @@ function barrier()
 end
 
 # TODO
-function print end
+@generated function _print(items...)
+    str = ""
+    args = []
+
+    for i in 1:length(items)
+        item = :(items[$i])
+        T = items[i]
+        if T <: Val
+            item = QuoteNode(T.parameters[1])
+        end
+        push!(args, item)
+    end
+
+    return quote
+        print($(args...))
+    end
+end
+
 
 
 """
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 46fd32df..43b8d7b4 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -224,7 +224,7 @@ end
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     POCL._print(args...)
 end
 

From bf37826bfc08b5aa53160b4b276c055f1f42cceb Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 18:54:48 -0400
Subject: [PATCH 22/51] tweak

---
 src/pocl/backend.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 43b8d7b4..60e37bbc 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -5,6 +5,7 @@ using ..POCL: @device_override, cl, method_table
 using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
+import KA.KernelIntrinsics as KI
 
 import StaticArrays
 
@@ -138,8 +139,6 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize
     return nothing
 end
 
-const KI = KA.KernelIntrinsics
-
 KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
 
 function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}

From cae65fe412b3ec65b5f2ada87878505c640ff88c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:09:13 -0400
Subject: [PATCH 23/51] Fix

---
 src/pocl/backend.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 60e37bbc..e711600c 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table
 using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
-import KA.KernelIntrinsics as KI
+import KernekAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 

From 88616587fdb69a427189dc184898c631118db922 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:14:43 -0400
Subject: [PATCH 24/51] Ugh

---
 src/pocl/backend.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index e711600c..b09be712 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table
 using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
-import KernekAbstractions.KernelIntrinsics as KI
+import KernelAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 

From 86c18fccb39338912c9c4de6b42f0a891cae5a34 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:28:24 -0400
Subject: [PATCH 25/51] Format

---
 src/KernelAbstractions.jl |  5 +----
 src/intrinsics.jl         | 26 ++++++++++++++------------
 src/pocl/backend.jl       | 16 ++++++++--------
 test/intrinsics.jl        |  5 +++--
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index c53f9989..feaafa5b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -248,11 +248,8 @@ end
 Declare storage that is local to a workgroup.
 """
 macro localmem(T, dims)
-    # Stay in sync with CUDAnative
-    # id = gensym("static_shmem")
-
     return quote
-        $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id))))
+        $SharedMemory($(esc(T)), Val($(esc(dims))))
     end
 end
 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 665b5903..6db136ac 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -66,8 +66,7 @@ Declare memory that is local to a workgroup.
     ```
     As well as the on-device functionality.
 """
-localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims))
-# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem")))
+localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
 
 """
     barrier()
@@ -110,7 +109,6 @@ end
 end
 
 
-
 """
     KIKernel{Backend, BKern}
 
@@ -235,7 +233,7 @@ There are a few keyword arguments that influence the behavior of `@kikernel`:
 """
 macro kikernel(backend, ex...)
     call = ex[end]
-    kwargs = map(ex[1:end-1]) do kwarg
+    kwargs = map(ex[1:(end - 1)]) do kwarg
         if kwarg isa Symbol
             :($kwarg = $kwarg)
         elseif Meta.isexpr(kwarg, :(=))
@@ -257,14 +255,14 @@ macro kikernel(backend, ex...)
     macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
         split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
     if !isempty(other_kwargs)
-        key,val = first(other_kwargs).args
+        key, val = first(other_kwargs).args
         throw(ArgumentError("Unsupported keyword argument '$key'"))
     end
 
     # handle keyword arguments that influence the macro's behavior
     launch = true
     for kwarg in macro_kwargs
-        key,val = kwarg.args
+        key, val = kwarg.args
         if key === :launch
             isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool"))
             launch = val::Bool
@@ -282,7 +280,8 @@ macro kikernel(backend, ex...)
 
     # convert the arguments, call the compiler and launch the kernel
     # while keeping the original arguments alive
-    push!(code.args,
+    push!(
+        code.args,
         quote
             $f_var = $f
             GC.@preserve $(vars...) $f_var begin
@@ -295,13 +294,16 @@ macro kikernel(backend, ex...)
                 end
                 $kernel
             end
-         end)
+        end
+    )
 
-    return esc(quote
-        let
-            $code
+    return esc(
+        quote
+            let
+                $code
+            end
         end
-    end)
+    )
 end
 
 end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index b09be712..c38188b3 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -141,32 +141,32 @@ end
 
 KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
 
-function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
     KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
+function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
     local_size = isnothing(workgroupsize) ? 1 : workgroupsize
     global_size = if isnothing(numworkgroups)
         1
     else
-        numworkgroups*local_size
+        numworkgroups * local_size
     end
 
-    obj.kern(args...; local_size, global_size)
+    return obj.kern(args...; local_size, global_size)
 end
 
 
-function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
     wginfo = cl.work_group_info(kikern.kern.fun, device())
-    Int(min(wginfo.size, max_work_items))
+    return Int(min(wginfo.size, max_work_items))
 end
 function KI.max_work_group_size(::POCLBackend)::Int
-    Int(device().max_work_group_size)
+    return Int(device().max_work_group_size)
 end
 function KI.multiprocessor_count(::POCLBackend)::Int
-    Int(device().max_compute_units)
+    return Int(device().max_compute_units)
 end
 
 ## Indexing Functions
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index cd75ad3f..b3b95d50 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -30,10 +30,10 @@ function intrinsics_testsuite(backend, AT)
             # Test with small kernel
             N = 16
             results = AT(zeros(Int, 6, N))
-            kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results)
+            kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(backend(), kernel) isa Int
-            @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1
+            @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1
 
             kernel(results, workgroupsize = 4, numworkgroups = 4)
             KernelAbstractions.synchronize(backend())
@@ -66,4 +66,5 @@ function intrinsics_testsuite(backend, AT)
             end
         end
     end
+    return nothing
 end

From 0a6f1720a946a37e7da78d4e46b1f3c032ae16f9 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:30:02 -0400
Subject: [PATCH 26/51] Format

---
 src/pocl/backend.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index c38188b3..3aac42a5 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -141,9 +141,9 @@ end
 
 KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
 
-function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT}
+function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
-    KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
+    return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
 function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)

From 2371f884bfe527064c60d5683317dbfa0e585423 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:32:24 -0400
Subject: [PATCH 27/51] drbh

---
 test/intrinsics.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index b3b95d50..6fbf55c0 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -1,4 +1,3 @@
-
 function test_intrinsics_kernel(results)
     # Test all intrinsics return NamedTuples with x, y, z fields
     global_size = KI.get_global_size()
@@ -19,7 +18,6 @@ function test_intrinsics_kernel(results)
     return
 end
 
-
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
         @testset "Basic intrinsics functionality" begin

From 0a45913f5ca0d69136bae016d1fb623dcdddbb00 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 5 Nov 2025 20:29:39 -0400
Subject: [PATCH 28/51] `_print` docstring

---
 src/intrinsics.jl | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 6db136ac..4eddf4a3 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -89,7 +89,20 @@ function barrier()
     error("Group barrier used outside kernel or not captured")
 end
 
-# TODO
+"""
+    _print(items...)
+
+    Overloaded by backends to enable `KernelAbstractions.@print`
+    functionality.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    _print(items...)
+    ```
+    As well as the on-device functionality,
+    or define it to return `nothing`
+"""
 @generated function _print(items...)
     str = ""
     args = []

From ff3b0777b0abd799fcd8c6105d221d789b03599d Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 10:36:56 -0400
Subject: [PATCH 29/51] Test all launch size options

---
 test/intrinsics.jl | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 6fbf55c0..79996850 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -20,6 +20,56 @@ end
 
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
+        @testset "Launch parameters" begin
+            # 1d
+            function launch_kernel1d(arr)
+                i, _, _ = KI.get_local_id()
+                gi, _, _ = KI.get_group_id()
+                ngi, _, _ = KI.get_num_groups()
+
+                arr[(gi - 1) * ngi + i] = 1f0
+                return
+            end
+            arr1d = AT(zeros(Float32, 4))
+            KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1d) .== 1)
+
+            # 1d tuple
+            arr1dt = AT(zeros(Float32, 4))
+            KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1dt) .== 1)
+
+            # 2d
+            function launch_kernel2d(arr)
+                i, j, _ = KI.get_local_id()
+                gi, gj, _ = KI.get_group_id()
+                ngi, ngj, _ = KI.get_num_groups()
+
+                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0
+                return
+            end
+            arr2d = AT(zeros(Float32, 4, 4))
+            KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr2d) .== 1)
+
+            # 3d
+            function launch_kernel3d(arr)
+                i, j, k = KI.get_local_id()
+                gi, gj, gk = KI.get_group_id()
+                ngi, ngj, ngk = KI.get_num_groups()
+
+                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0
+                return
+            end
+            arr3d = AT(zeros(Float32, 4, 4, 4))
+            KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr3d) .== 1)
+        end
+
         @testset "Basic intrinsics functionality" begin
 
             @test KI.max_work_group_size(backend()) isa Int

From 0732ae094ebbc2724e7f2c87b4279dd693308249 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 10:39:45 -0400
Subject: [PATCH 30/51] Fix backend

---
 src/pocl/backend.jl | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 3aac42a5..e66720ed 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -146,17 +146,23 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw
     return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
-    local_size = isnothing(workgroupsize) ? 1 : workgroupsize
-    global_size = if isnothing(numworkgroups)
-        1
-    else
-        numworkgroups * local_size
+function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
+    local_size = StaticArrays.MVector{3}((1,1,1))
+    if !isnothing(workgroupsize)
+        for (i, val) in enumerate(workgroupsize)
+            local_size[i] = val
+        end
     end
 
-    return obj.kern(args...; local_size, global_size)
-end
+    global_size = StaticArrays.MVector{3}((1,1,1))
+    if !isnothing(numworkgroups)
+        for (i, val) in enumerate(numworkgroups)
+            global_size[i] = val * local_size[i]
+        end
+    end
 
+    obj.kern(args...; local_size, global_size)
+end
 
 function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
     wginfo = cl.work_group_info(kikern.kern.fun, device())

From 7d6be1bcf9bcf70257ffb97ba7005f9644f449bf Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 10:54:31 -0400
Subject: [PATCH 31/51] Format

---
 src/pocl/backend.jl | 8 ++++----
 test/intrinsics.jl  | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index e66720ed..41bbea58 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -146,22 +146,22 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw
     return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing)
-    local_size = StaticArrays.MVector{3}((1,1,1))
+function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
+    local_size = StaticArrays.MVector{3}((1, 1, 1))
     if !isnothing(workgroupsize)
         for (i, val) in enumerate(workgroupsize)
             local_size[i] = val
         end
     end
 
-    global_size = StaticArrays.MVector{3}((1,1,1))
+    global_size = StaticArrays.MVector{3}((1, 1, 1))
     if !isnothing(numworkgroups)
         for (i, val) in enumerate(numworkgroups)
             global_size[i] = val * local_size[i]
         end
     end
 
-    obj.kern(args...; local_size, global_size)
+    return obj.kern(args...; local_size, global_size)
 end
 
 function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 79996850..3ec26c56 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -27,7 +27,7 @@ function intrinsics_testsuite(backend, AT)
                 gi, _, _ = KI.get_group_id()
                 ngi, _, _ = KI.get_num_groups()
 
-                arr[(gi - 1) * ngi + i] = 1f0
+                arr[(gi - 1) * ngi + i] = 1.0f0
                 return
             end
             arr1d = AT(zeros(Float32, 4))
@@ -47,7 +47,7 @@ function intrinsics_testsuite(backend, AT)
                 gi, gj, _ = KI.get_group_id()
                 ngi, ngj, _ = KI.get_num_groups()
 
-                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0
+                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1.0f0
                 return
             end
             arr2d = AT(zeros(Float32, 4, 4))
@@ -61,7 +61,7 @@ function intrinsics_testsuite(backend, AT)
                 gi, gj, gk = KI.get_group_id()
                 ngi, ngj, ngk = KI.get_num_groups()
 
-                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0
+                arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1.0f0
                 return
             end
             arr3d = AT(zeros(Float32, 4, 4, 4))

From 04f21ec918a8a9bb9ce2482bdd4d78d7458d85ab Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 12:18:43 -0400
Subject: [PATCH 32/51] More consistent docstrings

---
 src/intrinsics.jl | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 4eddf4a3..fb258578 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -7,6 +7,12 @@ import GPUCompiler: split_kwargs, assign_args!
     get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Return the number of global work-items specified.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_global_size end
 
@@ -17,6 +23,12 @@ Returns the unique global work-item ID.
 
 !!! note
     1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_global_id end
 
@@ -24,6 +36,12 @@ function get_global_id end
     get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Return the number of local work-items specified.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_local_size end
 
@@ -34,6 +52,12 @@ Returns the unique local work-item ID.
 
 !!! note
     1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_local_id end
 
@@ -41,6 +65,12 @@ function get_local_id end
     get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
 
 Returns the number of groups.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_num_groups end
 
@@ -51,6 +81,12 @@ Returns the unique group ID.
 
 !!! note
     1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
 """
 function get_group_id end
 
@@ -62,7 +98,7 @@ Declare memory that is local to a workgroup.
 !!! note
     Backend implementations **must** implement:
     ```
-    localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
+    @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims}
     ```
     As well as the on-device functionality.
 """
@@ -83,14 +119,13 @@ workgroup.
     ```
     @device_override barrier()
     ```
-    As well as the on-device functionality.
 """
 function barrier()
     error("Group barrier used outside kernel or not captured")
 end
 
 """
-    _print(items...)
+    _print(args...)
 
     Overloaded by backends to enable `KernelAbstractions.@print`
     functionality.
@@ -98,10 +133,10 @@ end
 !!! note
     Backend implementations **must** implement:
     ```
-    _print(items...)
+    @device_override _print(args...)
     ```
-    As well as the on-device functionality,
-    or define it to return `nothing`
+    If the backend does not support printing,
+    define it to return `nothing`.
 """
 @generated function _print(items...)
     str = ""

From d068b80405a483332a7afcf856d35ec5fde7263a Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 14:05:19 -0400
Subject: [PATCH 33/51] Qualify `map`

---
 src/intrinsics.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index fb258578..a70370b0 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -334,8 +334,8 @@ macro kikernel(backend, ex...)
             $f_var = $f
             GC.@preserve $(vars...) $f_var begin
                 $kernel_f = $kiconvert($backend, $f_var)
-                $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),))
-                $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
+                $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),))
+                $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
                 $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
                 if $launch
                     $kernel($(var_exprs...); $(call_kwargs...))

From 2fe4502e8418058f2f46a7d39f379204e0ddad68 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 18:44:28 -0400
Subject: [PATCH 34/51] `KIKernel` -> `Kernel`

---
 src/intrinsics.jl   | 10 +++++-----
 src/pocl/backend.jl |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index a70370b0..5f48d338 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -158,19 +158,19 @@ end
 
 
 """
-    KIKernel{Backend, BKern}
+    Kernel{Backend, BKern}
 
-KIKernel closure struct that is used to represent the backend
+Kernel closure struct that is used to represent the backend
 kernel on the host.
 
 !!! note
     Backend implementations **must** implement:
     ```
-    (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+    (kernel::Kernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
     ```
     As well as the on-device functionality.
 """
-struct KIKernel{B, Kern}
+struct Kernel{B, Kern}
     backend::B
     kern::Kern
 end
@@ -185,7 +185,7 @@ launching a kernel.
 !!! note
     Backend implementations **must** implement:
     ```
-    kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
+    kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
     ```
     As well as the on-device functionality.
 """
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 41bbea58..7383a66f 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -143,10 +143,10 @@ KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
 
 function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
-    return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
+    return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
+function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
     local_size = StaticArrays.MVector{3}((1, 1, 1))
     if !isnothing(workgroupsize)
         for (i, val) in enumerate(workgroupsize)
@@ -164,7 +164,7 @@ function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workg
     return obj.kern(args...; local_size, global_size)
 end
 
-function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
+function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
     wginfo = cl.work_group_info(kikern.kern.fun, device())
     return Int(min(wginfo.size, max_work_items))
 end

From aaf20a7942d517aee62d561532675123f7b87ec3 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 18:46:51 -0400
Subject: [PATCH 35/51] Remove redundant parameter

---
 src/intrinsics.jl   | 4 ++--
 src/pocl/backend.jl | 2 +-
 test/intrinsics.jl  | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 5f48d338..eda58ce4 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -176,7 +176,7 @@ struct Kernel{B, Kern}
 end
 
 """
-    kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int
+    kernel_max_work_group_size(kern; [max_work_items::Int])::Int
 
 The maximum workgroup size limit for a kernel as reported by the backend.
 This function should always be used to determine the workgroup size before
@@ -185,7 +185,7 @@ launching a kernel.
 !!! note
     Backend implementations **must** implement:
     ```
-    kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
+    kernel_max_work_group_size(kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int
     ```
     As well as the on-device functionality.
 """
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 7383a66f..08e5a76e 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -164,7 +164,7 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro
     return obj.kern(args...; local_size, global_size)
 end
 
-function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
     wginfo = cl.work_group_info(kikern.kern.fun, device())
     return Int(min(wginfo.size, max_work_items))
 end
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 3ec26c56..911e544f 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -80,8 +80,8 @@ function intrinsics_testsuite(backend, AT)
             results = AT(zeros(Int, 6, N))
             kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results)
 
-            @test KI.kernel_max_work_group_size(backend(), kernel) isa Int
-            @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1
+            @test KI.kernel_max_work_group_size(kernel) isa Int
+            @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
 
             kernel(results, workgroupsize = 4, numworkgroups = 4)
             KernelAbstractions.synchronize(backend())

From e2941fbcb0c21a9cea78b40fb526deba07a8f8a9 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 19:13:27 -0400
Subject: [PATCH 36/51] `kiconvert` -> `argconvert`

---
 src/intrinsics.jl   | 12 ++++++------
 src/pocl/backend.jl |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index eda58ce4..ad2c8f6c 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -224,7 +224,7 @@ Used for certain algorithm optimizations.
 multiprocessor_count(_) = 0
 
 """
-    kiconvert(::NewBackend, arg)
+    argconvert(::NewBackend, arg)
 
 This function is called for every argument to be passed to a kernel, allowing it to be
 converted to a GPU-friendly format.
@@ -232,10 +232,10 @@ converted to a GPU-friendly format.
 !!! note
     Backend implementations **must** implement:
     ```
-    kiconvert(::NewBackend, arg)
+    argconvert(::NewBackend, arg)
     ```
 """
-function kiconvert end
+function argconvert end
 
 """
     KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
@@ -269,7 +269,7 @@ High-level interface for executing code on a GPU.
 The `@kikernel` macro should prefix a call, with `func` a callable function or object that
 should return nothing. It will be compiled to a function native to the specified `backend`
 upon first use, and to a certain extent arguments will be converted and managed automatically
-using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is
+using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is
 called and launched according to the specified `backend`.
 
 There are a few keyword arguments that influence the behavior of `@kikernel`:
@@ -333,8 +333,8 @@ macro kikernel(backend, ex...)
         quote
             $f_var = $f
             GC.@preserve $(vars...) $f_var begin
-                $kernel_f = $kiconvert($backend, $f_var)
-                $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),))
+                $kernel_f = $argconvert($backend, $f_var)
+                $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),))
                 $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
                 $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
                 if $launch
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 08e5a76e..f5226bc7 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -139,7 +139,7 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize
     return nothing
 end
 
-KI.kiconvert(::POCLBackend, arg) = clconvert(arg)
+KI.argconvert(::POCLBackend, arg) = clconvert(arg)
 
 function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)

From 5262c9e6fbd6f05332dfbb45037e737693a94e52 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 19:16:05 -0400
Subject: [PATCH 37/51] Remove old definition

---
 src/intrinsics.jl | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index ad2c8f6c..172d1de1 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -138,23 +138,8 @@ end
     If the backend does not support printing,
     define it to return `nothing`.
 """
-@generated function _print(items...)
-    str = ""
-    args = []
-
-    for i in 1:length(items)
-        item = :(items[$i])
-        T = items[i]
-        if T <: Val
-            item = QuoteNode(T.parameters[1])
-        end
-        push!(args, item)
-    end
+function _print end
 
-    return quote
-        print($(args...))
-    end
-end
 
 
 """

From ca11482c526749b4c47ad3e23068ff8daed8c673 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Thu, 6 Nov 2025 19:28:48 -0400
Subject: [PATCH 38/51] Unbreak ABI

---
 src/KernelAbstractions.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index feaafa5b..95e71b12 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -248,8 +248,11 @@ end
 Declare storage that is local to a workgroup.
 """
 macro localmem(T, dims)
+    # Stay in sync with CUDAnative
+    id = gensym("static_shmem")
+
     return quote
-        $SharedMemory($(esc(T)), Val($(esc(dims))))
+        $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
     end
 end
 
@@ -811,8 +814,7 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)
-SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims)
+SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims)
 
 __synchronize() = KernelIntrinsics.barrier()
 

From d6f3e66dad2ce09fdcba1b0d2256fb4fd7e8dd9c Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 09:05:50 -0400
Subject: [PATCH 39/51] Readd old definition

---
 src/KernelAbstractions.jl | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 95e71b12..fb48fd7b 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -369,6 +369,25 @@ macro context()
     return esc(:(__ctx__))
 end
 
+# Defined to keep cpu support for `__print`
+@generated function KernelIntrinsics._print(items...)
+    str = ""
+    args = []
+
+    for i in 1:length(items)
+        item = :(items[$i])
+        T = items[i]
+        if T <: Val
+            item = QuoteNode(T.parameters[1])
+        end
+        push!(args, item)
+    end
+
+    return quote
+        print($(args...))
+    end
+end
+
 """
     @print(items...)
 

From 1400c9f061110e5e6732d66d7dfea2cf7914977e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 09:15:21 -0400
Subject: [PATCH 40/51] Naming

---
 src/intrinsics.jl   | 12 ++++++------
 src/pocl/backend.jl |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 172d1de1..5904d3a5 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -143,7 +143,7 @@ function _print end
 
 
 """
-    Kernel{Backend, BKern}
+    Kernel{Backend, Kern}
 
 Kernel closure struct that is used to represent the backend
 kernel on the host.
@@ -223,12 +223,12 @@ converted to a GPU-friendly format.
 function argconvert end
 
 """
-    KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
 
 Low-level interface to compile a function invocation for the currently-active GPU, returning
 a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref).
 
-Currently, only `kifunction` only supports the `name` keyword argument as it is the only one
+Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one
 by all backends.
 
 Keyword arguments:
@@ -237,10 +237,10 @@ Keyword arguments:
 !!! note
     Backend implementations **must** implement:
     ```
-    kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
     ```
 """
-function kifunction end
+function gpufunction end
 
 const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:name]
@@ -321,7 +321,7 @@ macro kikernel(backend, ex...)
                 $kernel_f = $argconvert($backend, $f_var)
                 $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),))
                 $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
-                $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
+                $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
                 if $launch
                     $kernel($(var_exprs...); $(call_kwargs...))
                 end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index f5226bc7..913fbe37 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -141,7 +141,7 @@ end
 
 KI.argconvert(::POCLBackend, arg) = clconvert(arg)
 
-function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
+function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
     return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
@@ -164,8 +164,8 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro
     return obj.kern(args...; local_size, global_size)
 end
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
-    wginfo = cl.work_group_info(kikern.kern.fun, device())
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
+    wginfo = cl.work_group_info(kernel.kern.fun, device())
     return Int(min(wginfo.size, max_work_items))
 end
 function KI.max_work_group_size(::POCLBackend)::Int

From 055a81f04ec9d358e78b381a07b293710c5b48d0 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 09:35:34 -0400
Subject: [PATCH 41/51] Reword

---
 src/intrinsics.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 5904d3a5..5960033c 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -211,8 +211,8 @@ multiprocessor_count(_) = 0
 """
     argconvert(::NewBackend, arg)
 
-This function is called for every argument to be passed to a kernel, allowing it to be
-converted to a GPU-friendly format.
+This function is called for every argument to be passed to a kernel,
+converting them to their device side representation.
 
 !!! note
     Backend implementations **must** implement:

From 913f8c8b048e0d72065f9243df5a20784f08972f Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 13:49:35 -0400
Subject: [PATCH 42/51] Docstrings

---
 src/intrinsics.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 5960033c..fb1b00ad 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -1,3 +1,15 @@
+"""
+# KernelIntrinics
+
+The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and
+host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side
+intrinsics functionality in `KernelAbstractions`.
+
+Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level
+functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality
+such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions
+like allocating arrays on a backend.
+"""
 module KernelIntrinsics
 
 import ..KernelAbstractions: Backend
@@ -111,6 +123,9 @@ After a `barrier()` call, all read and writes to global and local memory
 from each thread in the workgroup are visible in from all other threads in the
 workgroup.
 
+This does **not** guarantee that a write from a thread in a certain workgroup will
+be visible to a thread in a different workgroup.
+
 !!! note
     `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all.
 

From 298e8078888ece5f2af69deb36ee0692ef0cb4a6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 14:03:50 -0400
Subject: [PATCH 43/51] Rename `kikernel`

---
 src/intrinsics.jl  | 16 +++++++++++-----
 test/intrinsics.jl | 10 +++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index fb1b00ad..cd94d473 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -241,7 +241,7 @@ function argconvert end
     KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
 
 Low-level interface to compile a function invocation for the currently-active GPU, returning
-a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref).
+a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref).
 
 Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one
 by all backends.
@@ -262,24 +262,30 @@ const COMPILER_KWARGS = [:name]
 const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
 
 """
-    @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
+    KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
 
 High-level interface for executing code on a GPU.
 
-The `@kikernel` macro should prefix a call, with `func` a callable function or object that
+The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that
 should return nothing. It will be compiled to a function native to the specified `backend`
 upon first use, and to a certain extent arguments will be converted and managed automatically
 using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is
 called and launched according to the specified `backend`.
 
-There are a few keyword arguments that influence the behavior of `@kikernel`:
+There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`:
 
 - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned
   kernel object should be launched by calling it and passing arguments again.
 - `name`: the name of the kernel in the generated code. Defaults to an automatically-
   generated name.
+
+!!! note
+    `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts
+    a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is
+    used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel`
+    is used kernel definition for use with the original higher-level `KernelAbstractions` API.
 """
-macro kikernel(backend, ex...)
+macro kernel(backend, ex...)
     call = ex[end]
     kwargs = map(ex[1:(end - 1)]) do kwarg
         if kwarg isa Symbol
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 911e544f..7a7ec1e0 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -31,13 +31,13 @@ function intrinsics_testsuite(backend, AT)
                 return
             end
             arr1d = AT(zeros(Float32, 4))
-            KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d)
+            KI.@kernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d)
             KernelAbstractions.synchronize(backend())
             @test all(Array(arr1d) .== 1)
 
             # 1d tuple
             arr1dt = AT(zeros(Float32, 4))
-            KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt)
+            KI.@kernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt)
             KernelAbstractions.synchronize(backend())
             @test all(Array(arr1dt) .== 1)
 
@@ -51,7 +51,7 @@ function intrinsics_testsuite(backend, AT)
                 return
             end
             arr2d = AT(zeros(Float32, 4, 4))
-            KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d)
+            KI.@kernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d)
             KernelAbstractions.synchronize(backend())
             @test all(Array(arr2d) .== 1)
 
@@ -65,7 +65,7 @@ function intrinsics_testsuite(backend, AT)
                 return
             end
             arr3d = AT(zeros(Float32, 4, 4, 4))
-            KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
+            KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
             KernelAbstractions.synchronize(backend())
             @test all(Array(arr3d) .== 1)
         end
@@ -78,7 +78,7 @@ function intrinsics_testsuite(backend, AT)
             # Test with small kernel
             N = 16
             results = AT(zeros(Int, 6, N))
-            kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results)
+            kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
 
             @test KI.kernel_max_work_group_size(kernel) isa Int
             @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1

From 42f17d6ee9766662732eb0de896d1c0ff5b37ad4 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 17:17:29 -0400
Subject: [PATCH 44/51] Renaming

---
 src/KernelAbstractions.jl | 24 ++++++++++++------------
 src/intrinsics.jl         | 31 +++++++++++++++++--------------
 src/pocl/backend.jl       |  2 +-
 3 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index fb48fd7b..a0f57c74 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -200,8 +200,8 @@ Abstract type for all KernelAbstractions backends.
 abstract type Backend end
 
 include("intrinsics.jl")
-import .KernelIntrinsics
-export KernelIntrinsics
+import .KernelIntrinsics: KernelIntrinsics, KI
+export KernelIntrinsics, KI
 
 ###
 # Kernel language
@@ -370,7 +370,7 @@ macro context()
 end
 
 # Defined to keep cpu support for `__print`
-@generated function KernelIntrinsics._print(items...)
+@generated function KI._print(items...)
     str = ""
     args = []
 
@@ -489,25 +489,25 @@ end
 ###
 
 @inline function __index_Local_Linear(ctx)
-    return KernelIntrinsics.get_local_id().x
+    return KI.get_local_id().x
 end
 
 @inline function __index_Group_Linear(ctx)
-    return KernelIntrinsics.get_group_id().x
+    return KI.get_group_id().x
 end
 
 @inline function __index_Global_Linear(ctx)
-    return KernelIntrinsics.get_global_id().x
+    return KI.get_global_id().x
 end
 
 @inline function __index_Local_Cartesian(ctx)
-    return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
+    return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x]
 end
 @inline function __index_Group_Cartesian(ctx)
-    return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
+    return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x]
 end
 @inline function __index_Global_Cartesian(ctx)
-    return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
+    return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x)
 end
 
 @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
@@ -833,11 +833,11 @@ include("macros.jl")
 ###
 
 function Scratchpad end
-SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims)
+SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)
 
-__synchronize() = KernelIntrinsics.barrier()
+__synchronize() = KI.barrier()
 
-__print(args...) = KernelIntrinsics._print(args...)
+__print(args...) = KI._print(args...)
 
 # Utils
 __size(args::Tuple) = Tuple{args...}
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index cd94d473..e0b2c46a 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -1,17 +1,20 @@
 """
-# KernelIntrinics
+# `KernelIntrinics`/`KI`
 
-The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and
-host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side
+The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and
+host-side functionality. The `KI` intrinsics are used to define the higher-level device-side
 intrinsics functionality in `KernelAbstractions`.
 
-Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level
+Both provide APIs for host and device-side functionality, but `KI` focuses on on lower-level
 functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality
 such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions
 like allocating arrays on a backend.
 """
 module KernelIntrinsics
 
+const KI = KernelIntrinsics
+export KI
+
 import ..KernelAbstractions: Backend
 import GPUCompiler: split_kwargs, assign_args!
 
@@ -238,12 +241,12 @@ converting them to their device side representation.
 function argconvert end
 
 """
-    KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
 
 Low-level interface to compile a function invocation for the currently-active GPU, returning
-a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref).
+a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref).
 
-Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one
+Currently, `kernel_function` only supports the `name` keyword argument as it is the only one
 by all backends.
 
 Keyword arguments:
@@ -252,27 +255,27 @@ Keyword arguments:
 !!! note
     Backend implementations **must** implement:
     ```
-    gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
     ```
 """
-function gpufunction end
+function kernel_function end
 
 const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:name]
 const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
 
 """
-    KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
+    KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
 
 High-level interface for executing code on a GPU.
 
-The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that
+The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that
 should return nothing. It will be compiled to a function native to the specified `backend`
 upon first use, and to a certain extent arguments will be converted and managed automatically
 using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is
 called and launched according to the specified `backend`.
 
-There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`:
+There are a few keyword arguments that influence the behavior of `KI.@kernel`:
 
 - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned
   kernel object should be launched by calling it and passing arguments again.
@@ -280,7 +283,7 @@ There are a few keyword arguments that influence the behavior of `KernelIntrinsi
   generated name.
 
 !!! note
-    `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts
+    `KI.@kernel` differs from the `KernelAbstractions` macro in that this macro acts
     a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is
     used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel`
     is used kernel definition for use with the original higher-level `KernelAbstractions` API.
@@ -342,7 +345,7 @@ macro kernel(backend, ex...)
                 $kernel_f = $argconvert($backend, $f_var)
                 $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),))
                 $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
-                $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
+                $kernel = $kernel_function($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
                 if $launch
                     $kernel($(var_exprs...); $(call_kwargs...))
                 end
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 913fbe37..14fe6ae8 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -141,7 +141,7 @@ end
 
 KI.argconvert(::POCLBackend, arg) = clconvert(arg)
 
-function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
+function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
     return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end

From da98ac896fad002655b2d62d8bafacd57f5778ed Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 11 Nov 2025 20:02:34 -0400
Subject: [PATCH 45/51] Update `unsafe_indices` examples to use KI directly

---
 examples/histogram.jl         | 20 ++++++------
 examples/performant_matmul.jl | 61 +++++++++++++++++------------------
 2 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/examples/histogram.jl b/examples/histogram.jl
index 9edc0293..39f243c5 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -12,16 +12,15 @@ function create_histogram(input)
     return histogram_output
 end
 
-# This a 1D histogram kernel where the histogramming happens on shmem
-@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
-    gid = @index(Group, Linear)
-    lid = @index(Local, Linear)
+# This a 1D histogram kernel where the histogramming happens on static shmem
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where gs
+    gid = KI.get_group_id().x
+    lid = KI.get_local_id().x
 
-    @uniform gs = prod(@groupsize())
     tid = (gid - 1) * gs + lid
-    @uniform N = length(histogram_output)
+    N = length(histogram_output)
 
-    shared_histogram = @localmem eltype(input) (gs)
+    shared_histogram = KI.localmemory(eltype(input), gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
@@ -32,7 +31,7 @@ end
 
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
-        @synchronize()
+        KI.barrier()
 
         max_element = min_element + gs
         if max_element > N
@@ -46,7 +45,7 @@ end
             @atomic shared_histogram[bin] += 1
         end
 
-        @synchronize()
+        KI.barrier()
 
         if ((lid + min_element - 1) <= N)
             @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
@@ -59,8 +58,7 @@ end
 function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (groupsize,))
-    kernel!(histogram_output, input, ndrange = size(input))
+    KI.@kernel backend workgroupsize=groupsize numworkgroups=cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
     return
 end
 
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index ac56edb5..f19fe75b 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 #  Metal sometimes supports fewer.
 const TILE_DIM = 16
 
-@kernel unsafe_indices = true function coalesced_matmul_kernel!(
-        output, @Const(input1), @Const(input2), N, R, M,
-        ::Val{BANK} = Val(1),
-    ) where {BANK}
-    gi, gj = @index(Group, NTuple)
-    i, j = @index(Local, NTuple)
-
-    TILE_DIM = @uniform @groupsize()[1]
+function coalesced_matmul_kernel!(
+        output, input1, input2, N, R, M,
+        ::Val{TDIM}, ::Val{BANK} = Val(1)
+    ) where {TDIM, BANK}
+    gi, gj, _ = KI.get_group_id()
+    i, j, _ = KI.get_local_id()
 
     # +1 to avoid bank conflicts on shared memory
-    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
-    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
+    tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
 
-    # private variable for tile output
-    outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    # variable for tile output
+    outval = -zero(eltype(output))
 
-    @uniform N = size(output, 1)
+    N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    NUM_TILES = div(R + TDIM - 1, TDIM)
 
     # loop over all tiles needed for this calculation
     for t in 0:(NUM_TILES - 1)
         # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # load inputs into tiles, with bounds checking for non-square matrices
-        if I <= N && t * TILE_DIM + j <= R
-            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        if I <= N && t * TDIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TDIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
         if t * TILE_DIM + i <= R && J <= M
-            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+            @inbounds tile2[i, j] = input2[t * TDIM + i, J]
         else
             @inbounds tile2[i, j] = 0.0
         end
 
         # wait for all tiles to be loaded
-        @synchronize
+        KI.barrier()
 
         # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
-        @simd for k in 1:TILE_DIM
+        @simd for k in 1:TDIM
             @inbounds out += tile1[i, k] * tile2[k, j]
         end
-        outval[1] += out
+        outval += out
 
-        @synchronize
+        KI.barrier()
     end
 
     # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
+    I = (gi - 1) * TDIM + i
+    J = (gj - 1) * TDIM + j
 
     # save if inbounds
     if I <= N && J <= M
-        @inbounds output[I, J] = outval[1]
+        @inbounds output[I, J] = outval
     end
+    return nothing
 end
 
 N = 1024
@@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+workgroupsize=(TILE_DIM, TILE_DIM)
+numworkgroups=(cld(size(C,1), TILE_DIM), cld(size(C,2), TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange = size(C))
+KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
 
 @test isapprox(A * B, C)

From 8510dfe46a1888c2362c045176b16f4451376abe Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 11 Nov 2025 20:08:22 -0400
Subject: [PATCH 46/51] Format

---
 examples/histogram.jl         | 4 ++--
 examples/performant_matmul.jl | 2 +-
 src/intrinsics.jl             | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/histogram.jl b/examples/histogram.jl
index 39f243c5..273c6d71 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -13,7 +13,7 @@ function create_histogram(input)
 end
 
 # This a 1D histogram kernel where the histogramming happens on static shmem
-function histogram_kernel!(histogram_output, input, ::Val{gs}) where gs
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
     gid = KI.get_group_id().x
     lid = KI.get_local_id().x
 
@@ -58,7 +58,7 @@ end
 function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    KI.@kernel backend workgroupsize=groupsize numworkgroups=cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
+    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
     return
 end
 
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index f19fe75b..d9f1d063 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -81,7 +81,7 @@ B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
 workgroupsize=(TILE_DIM, TILE_DIM)
-numworkgroups=(cld(size(C,1), TILE_DIM), cld(size(C,2), TILE_DIM))
+numworkgroups=(cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
 
 KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index e0b2c46a..2025bf5e 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -159,7 +159,6 @@ end
 function _print end
 
 
-
 """
     Kernel{Backend, Kern}
 

From 89b57e923b3d86dc285c227fba7e9952f998170b Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 11 Nov 2025 20:48:20 -0400
Subject: [PATCH 47/51] Format

---
 examples/performant_matmul.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index d9f1d063..0eee6e3c 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -80,8 +80,8 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-workgroupsize=(TILE_DIM, TILE_DIM)
-numworkgroups=(cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
+workgroupsize = (TILE_DIM, TILE_DIM)
+numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
 
 KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)

From ec246c711dfbd06de72476c610b305a1e327538d Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Tue, 11 Nov 2025 20:53:30 -0400
Subject: [PATCH 48/51] Format

---
 examples/histogram.jl         | 2 +-
 examples/performant_matmul.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/histogram.jl b/examples/histogram.jl
index 273c6d71..29d3bf57 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -52,7 +52,7 @@ function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
         end
 
     end
-
+    return
 end
 
 function histogram!(histogram_output, input, groupsize = 256)
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index 0eee6e3c..905b3a05 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -70,7 +70,7 @@ function coalesced_matmul_kernel!(
     if I <= N && J <= M
         @inbounds output[I, J] = outval
     end
-    return nothing
+    return
 end
 
 N = 1024

From 5620ba2d6ee6e774fbcc7b9680258075c13fe666 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 12 Nov 2025 17:11:36 -0400
Subject: [PATCH 49/51] Add kernel launch test

---
 src/intrinsics.jl   | 17 +++++++++++++++--
 src/pocl/backend.jl | 19 ++++++-------------
 test/intrinsics.jl  |  4 ++++
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 2025bf5e..d7c91fbc 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -168,15 +168,28 @@ kernel on the host.
 !!! note
     Backend implementations **must** implement:
     ```
-    (kernel::Kernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...)
+    (kernel::Kernel{<:NewBackend})(args...; numworkgroups=1, workgroupsize=1)
     ```
-    As well as the on-device functionality.
+    With the `numworkgroups` and `workgroupsize` arguments accepting a scalar Integer
+    or or a 1, 2, or 3 Integer tuple and throwing an `ArgumentError` otherwise. The
+    helper function `KI.check_launch_args(numworkgroups, workgroupsize)` can be used
+    by the backend or a custom check can be implemented.
+
+    Backends must also implement the on-device kernel launch functionality.
 """
 struct Kernel{B, Kern}
     backend::B
     kern::Kern
 end
 
+function check_launch_args(numworkgroups, workgroupsize)
+    length(numworkgroups) <= 3 ||
+        throw(ArgumentError("`numworkgroups` only accepts up to 3 dimensions"))
+    length(workgroupsize) <= 3 ||
+        throw(ArgumentError("`workgroupsize` only accepts up to 3 dimensions"))
+    return
+end
+
 """
     kernel_max_work_group_size(kern; [max_work_items::Int])::Int
 
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 14fe6ae8..371891d5 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -146,20 +146,13 @@ function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothin
     return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
 end
 
-function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
-    local_size = StaticArrays.MVector{3}((1, 1, 1))
-    if !isnothing(workgroupsize)
-        for (i, val) in enumerate(workgroupsize)
-            local_size[i] = val
-        end
-    end
+function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
+    KI.check_launch_args(numworkgroups, workgroupsize)
 
-    global_size = StaticArrays.MVector{3}((1, 1, 1))
-    if !isnothing(numworkgroups)
-        for (i, val) in enumerate(numworkgroups)
-            global_size[i] = val * local_size[i]
-        end
-    end
+    local_size = (workgroupsize..., ntuple(_->1, 3-length(workgroupsize))...,)
+
+    numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,)
+    global_size = local_size .* numworkgroups
 
     return obj.kern(args...; local_size, global_size)
 end
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 7a7ec1e0..dff6751e 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -68,6 +68,10 @@ function intrinsics_testsuite(backend, AT)
             KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
             KernelAbstractions.synchronize(backend())
             @test all(Array(arr3d) .== 1)
+
+            # 4d (Errors)
+            @test_throws ArgumentError (KI.@kernel backend() numworkgroups = (2, 2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d))
+            @test_throws ArgumentError (KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2, 2) launch_kernel3d(arr3d))
         end
 
         @testset "Basic intrinsics functionality" begin

From b0013f112172044a8d9ff137db9074956abf4086 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 12 Nov 2025 22:32:04 -0400
Subject: [PATCH 50/51] Synchronize

---
 src/pocl/backend.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 371891d5..055e1c7d 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -154,7 +154,10 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize
     numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,)
     global_size = local_size .* numworkgroups
 
-    return obj.kern(args...; local_size, global_size)
+    event = obj.kern(args...; local_size, global_size)
+    wait(event)
+    cl.clReleaseEvent(event)
+    return nothing
 end
 
 function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int

From da1d96dc1c931c652a49b7426d7a99c8324fa534 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 16 Nov 2025 19:08:31 +0100
Subject: [PATCH 51/51] don't export KI and fix POCL.synchronize

---
 examples/histogram.jl         |  2 ++
 examples/performant_matmul.jl |  3 +++
 src/KernelAbstractions.jl     |  4 ++--
 src/intrinsics.jl             |  7 ++-----
 src/pocl/backend.jl           | 13 ++++++++++---
 test/intrinsics.jl            |  2 ++
 6 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/examples/histogram.jl b/examples/histogram.jl
index 29d3bf57..f2704fd0 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -1,6 +1,8 @@
 # INCLUDE ROCM
 using KernelAbstractions, Test
 using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
+import KernelAbstractions.KernelIntrinsics as KI
+
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
 # Function to use as a baseline for CPU metrics
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index 905b3a05..0a279d32 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -1,7 +1,10 @@
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
+
 using StaticArrays
 using Test
 using Random
+
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
 # We use a TILE_DIM of 16 as a safe value since while
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index a0f57c74..cdb4dd96 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -200,8 +200,8 @@ Abstract type for all KernelAbstractions backends.
 abstract type Backend end
 
 include("intrinsics.jl")
-import .KernelIntrinsics: KernelIntrinsics, KI
-export KernelIntrinsics, KI
+import .KernelIntrinsics as KI
+export KernelIntrinsics
 
 ###
 # Kernel language
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index d7c91fbc..80ccf6f0 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -1,7 +1,7 @@
 """
-# `KernelIntrinics`/`KI`
+# `KernelIntrinsics`
 
-The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and
+The `KernelIntrinsics` (or `KI`) module defines the API interface for backends to define various lower-level device and
 host-side functionality. The `KI` intrinsics are used to define the higher-level device-side
 intrinsics functionality in `KernelAbstractions`.
 
@@ -12,9 +12,6 @@ like allocating arrays on a backend.
 """
 module KernelIntrinsics
 
-const KI = KernelIntrinsics
-export KI
-
 import ..KernelAbstractions: Backend
 import GPUCompiler: split_kwargs, assign_args!
 
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 055e1c7d..f23e20b0 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -57,7 +57,14 @@ KA.functional(::POCLBackend) = true
 KA.pagelock!(::POCLBackend, x) = nothing
 
 KA.get_backend(::Array) = POCLBackend()
-KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
+
+## Implementation note:
+## The POCL backend uses `Base.Array` as it's array type, so the external operations
+## `broadcast`, `*` and other high-level operations are handled by Julia. In order
+## to provide the same memory synchronization semantics as other backends, we
+## must synchronize upon kernel launch and can't rely on synchronization upon
+## array access. Therefore, `synchronize` is a no-op.
+KA.synchronize(::POCLBackend) = nothing
 KA.supports_float64(::POCLBackend) = true
 KA.supports_unified(::POCLBackend) = true
 
@@ -149,9 +156,9 @@ end
 function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    local_size = (workgroupsize..., ntuple(_->1, 3-length(workgroupsize))...,)
+    local_size = (workgroupsize..., ntuple(_ -> 1, 3 - length(workgroupsize))...)
 
-    numworkgroups = (numworkgroups..., ntuple(_->1, 3-length(numworkgroups))...,)
+    numworkgroups = (numworkgroups..., ntuple(_ -> 1, 3 - length(numworkgroups))...)
     global_size = local_size .* numworkgroups
 
     event = obj.kern(args...; local_size, global_size)
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index dff6751e..97548c47 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -1,3 +1,5 @@
+import KernelAbstractions.KernelIntrinsics as KI
+
 function test_intrinsics_kernel(results)
     # Test all intrinsics return NamedTuples with x, y, z fields
     global_size = KI.get_global_size()