50 commits
70ef83a
define basic intrinsics
vchuravy Feb 4, 2025
02f3124
Fix docstrings
christiangnrd Sep 16, 2025
1103574
localmemory
christiangnrd Sep 16, 2025
8f40452
Move default barrier implementation to KernelIntrinsics
christiangnrd Sep 18, 2025
471dc58
Implement remaining intrinsics for POCL
christiangnrd Sep 18, 2025
a8a14a1
Add basic tests
christiangnrd Sep 18, 2025
96d9ffc
Int32 -> Int
christiangnrd Oct 1, 2025
685a7d4
Format
christiangnrd Oct 2, 2025
a54fb5d
Launch interface
christiangnrd Oct 22, 2025
5a6cd9b
Docs
christiangnrd Oct 22, 2025
faa5213
New stuff docs
christiangnrd Oct 22, 2025
6baa445
Fix
christiangnrd Oct 22, 2025
1953e8d
Temp adaptation to test in-progress interface with CUDA.jl
christiangnrd Oct 22, 2025
ba52bbb
Better launch interface
christiangnrd Nov 4, 2025
1fb74a0
Only keep common compiler kwarg
christiangnrd Nov 4, 2025
36c67ce
Fixup tests
christiangnrd Nov 5, 2025
0636279
No `backend` in macro kwargs
christiangnrd Nov 5, 2025
48c6b8b
Tests
christiangnrd Nov 5, 2025
cc240c6
Fixes
christiangnrd Nov 5, 2025
f41d1e0
More fix
christiangnrd Nov 5, 2025
8e97be1
Print
christiangnrd Nov 5, 2025
bf37826
tweak
christiangnrd Nov 5, 2025
cae65fe
Fix
christiangnrd Nov 5, 2025
8861658
Ugh
christiangnrd Nov 5, 2025
86c18fc
Format
christiangnrd Nov 5, 2025
0a6f172
Format
christiangnrd Nov 5, 2025
2371f88
drbh
christiangnrd Nov 5, 2025
0a45913
`_print` docstring
christiangnrd Nov 6, 2025
ff3b077
Test all launch size options
christiangnrd Nov 6, 2025
0732ae0
Fix backend
christiangnrd Nov 6, 2025
7d6be1b
Format
christiangnrd Nov 6, 2025
04f21ec
More consistent docstrings
christiangnrd Nov 6, 2025
d068b80
Qualify `map`
christiangnrd Nov 6, 2025
2fe4502
`KIKernel` -> `Kernel`
christiangnrd Nov 6, 2025
aaf20a7
Remove redundant parameter
christiangnrd Nov 6, 2025
e2941fb
`kiconvert` -> `argconvert`
christiangnrd Nov 6, 2025
5262c9e
Remove old definition
christiangnrd Nov 6, 2025
ca11482
Unbreak ABI
christiangnrd Nov 6, 2025
d6f3e66
Readd old definition
christiangnrd Nov 7, 2025
1400c9f
Naming
christiangnrd Nov 7, 2025
055a81f
Reword
christiangnrd Nov 7, 2025
913f8c8
Docstrings
christiangnrd Nov 7, 2025
298e807
Rename `kikernel`
christiangnrd Nov 7, 2025
42f17d6
Renaming
christiangnrd Nov 7, 2025
da98ac8
Update `unsafe_indices` examples to use KI directly
christiangnrd Nov 12, 2025
8510dfe
Format
christiangnrd Nov 12, 2025
89b57e9
Format
christiangnrd Nov 12, 2025
ec246c7
Format
christiangnrd Nov 12, 2025
5620ba2
Add kernel launch test
christiangnrd Nov 12, 2025
b0013f1
Synchronize
christiangnrd Nov 13, 2025
22 changes: 10 additions & 12 deletions examples/histogram.jl
@@ -12,16 +12,15 @@ function create_histogram(input)
return histogram_output
end

# This a 1D histogram kernel where the histogramming happens on shmem
@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
gid = @index(Group, Linear)
lid = @index(Local, Linear)
# This is a 1D histogram kernel where the histogramming happens on static shmem
function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
gid = KI.get_group_id().x
lid = KI.get_local_id().x

@uniform gs = prod(@groupsize())
tid = (gid - 1) * gs + lid
@uniform N = length(histogram_output)
N = length(histogram_output)

shared_histogram = @localmem eltype(input) (gs)
shared_histogram = KI.localmemory(eltype(input), gs)

# This will go through all input elements and assign them to a location in
# shmem. Note that if there is not enough shmem, we create different shmem
@@ -32,7 +31,7 @@

# Setting shared_histogram to 0
@inbounds shared_histogram[lid] = 0
@synchronize()
KI.barrier()

max_element = min_element + gs
if max_element > N
@@ -46,21 +45,20 @@
@atomic shared_histogram[bin] += 1
end

@synchronize()
KI.barrier()

if ((lid + min_element - 1) <= N)
@atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
end

end

return
end

function histogram!(histogram_output, input, groupsize = 256)
backend = get_backend(histogram_output)
# Need static block size
kernel! = histogram_kernel!(backend, (groupsize,))
kernel!(histogram_output, input, ndrange = size(input))
KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
return
end

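Note: the change above swaps the KernelAbstractions `@kernel`/`ndrange` launch for the new KernelIntrinsics interface. As a standalone sketch of that pattern (the `KI.@kernel` keyword syntax, `KI.get_global_id`, and `get_backend` are taken from this PR; the `saxpy!` kernel and the 256-item default workgroup are hypothetical, for illustration only):

```julia
using KernelAbstractions  # exports `KernelIntrinsics` and the `KI` alias in this PR

# Hypothetical element-wise kernel written directly against the intrinsics.
function saxpy_kernel!(y, a, x)
    i = KI.get_global_id().x          # 1-based global linear index
    if i <= length(y)                 # the grid may overshoot, so guard explicitly
        @inbounds y[i] = a * x[i] + y[i]
    end
    return
end

function saxpy!(y, a, x, groupsize = 256)
    backend = get_backend(y)
    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(y), groupsize) saxpy_kernel!(y, a, x)
    return
end
```

After the launch, `KernelAbstractions.synchronize(backend)` waits for completion, as the matmul example below does.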
61 changes: 30 additions & 31 deletions examples/performant_matmul.jl
@@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
# Metal sometimes supports fewer.
const TILE_DIM = 16

@kernel unsafe_indices = true function coalesced_matmul_kernel!(
output, @Const(input1), @Const(input2), N, R, M,
::Val{BANK} = Val(1),
) where {BANK}
gi, gj = @index(Group, NTuple)
i, j = @index(Local, NTuple)

TILE_DIM = @uniform @groupsize()[1]
function coalesced_matmul_kernel!(
output, input1, input2, N, R, M,
::Val{TDIM}, ::Val{BANK} = Val(1)
) where {TDIM, BANK}
gi, gj, _ = KI.get_group_id()
i, j, _ = KI.get_local_id()

# +1 to avoid bank conflicts on shared memory
tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))

# private variable for tile output
outval = @private eltype(output) 1
@inbounds outval[1] = -zero(eltype(output))
# variable for tile output
outval = -zero(eltype(output))

@uniform N = size(output, 1)
N = size(output, 1)
# number of tiles depends on inner dimension
@uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
NUM_TILES = div(R + TDIM - 1, TDIM)

# loop over all tiles needed for this calculation
for t in 0:(NUM_TILES - 1)
# Can't use @index(Global), because we use a smaller ndrange
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
I = (gi - 1) * TDIM + i
J = (gj - 1) * TDIM + j

# load inputs into tiles, with bounds checking for non-square matrices
if I <= N && t * TILE_DIM + j <= R
@inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
if I <= N && t * TDIM + j <= R
@inbounds tile1[i, j] = input1[I, t * TDIM + j]
else
@inbounds tile1[i, j] = 0.0
end
if t * TILE_DIM + i <= R && J <= M
@inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
@inbounds tile2[i, j] = input2[t * TDIM + i, J]
else
@inbounds tile2[i, j] = 0.0
end

# wait for all tiles to be loaded
@synchronize
KI.barrier()

# get global values again
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
I = (gi - 1) * TDIM + i
J = (gj - 1) * TDIM + j

# calculate value of spot in output, use temporary value to allow for vectorization
out = zero(eltype(output))
@simd for k in 1:TILE_DIM
@simd for k in 1:TDIM
@inbounds out += tile1[i, k] * tile2[k, j]
end
outval[1] += out
outval += out

@synchronize
KI.barrier()
end

# get global indices again
I = (gi - 1) * TILE_DIM + i
J = (gj - 1) * TILE_DIM + j
I = (gi - 1) * TDIM + i
J = (gj - 1) * TDIM + j

# save if inbounds
if I <= N && J <= M
@inbounds output[I, J] = outval[1]
@inbounds output[I, J] = outval
end
return
end

N = 1024
@@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R))
B = rand!(allocate(backend, Float32, R, M))
C = KernelAbstractions.zeros(backend, Float32, N, M)

kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
workgroupsize = (TILE_DIM, TILE_DIM)
numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))

kern(C, A, B, N, R, M, ndrange = size(C))
KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
KernelAbstractions.synchronize(backend)

@test isapprox(A * B, C)
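Note: the explicit `numworkgroups` replaces the old `ndrange = size(C)`; the caller now sizes the grid, and the in-kernel `I <= N && J <= M` checks mask the overhang when a dimension is not a multiple of `TILE_DIM`. A quick sanity check of that arithmetic (plain Julia, no PR-specific API):

```julia
TILE_DIM = 16
# Exact fit for the 1024x1024 case used above:
@assert cld(1024, TILE_DIM) * TILE_DIM == 1024
# Ragged case: 63 groups of 16 cover 1008 columns, 8 of which the bounds check discards.
@assert cld(1000, TILE_DIM) == 63
@assert cld(1000, TILE_DIM) * TILE_DIM - 1000 == 8
```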
83 changes: 51 additions & 32 deletions src/KernelAbstractions.jl
@@ -194,6 +194,15 @@ function unsafe_free! end

unsafe_free!(::AbstractArray) = return

"""
Abstract type for all KernelAbstractions backends.
"""
abstract type Backend end

include("intrinsics.jl")
import .KernelIntrinsics: KernelIntrinsics, KI
export KernelIntrinsics, KI

###
# Kernel language
# - @localmem
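Note: this hunk moves the `Backend` definition ahead of the intrinsics.jl include (presumably so the new module can refer to it) and exports both `KernelIntrinsics` and its `KI` shorthand, so kernels can use the shorter qualifier. Downstream code only needs the abstract type to declare a backend; a minimal sketch (the `DummyBackend` name is hypothetical):

```julia
using KernelAbstractions

# Hypothetical placeholder backend, only to show the subtyping relationship
# against the `Backend` type declared above.
struct DummyBackend <: KernelAbstractions.Backend end

@assert DummyBackend() isa KernelAbstractions.Backend
```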
@@ -360,6 +369,25 @@ macro context()
return esc(:(__ctx__))
end

# Defined to keep cpu support for `__print`
@generated function KI._print(items...)
str = ""
args = []

for i in 1:length(items)
item = :(items[$i])
T = items[i]
if T <: Val
item = QuoteNode(T.parameters[1])
end
push!(args, item)
end

return quote
print($(args...))
end
end

"""
@print(items...)

@@ -460,13 +488,27 @@
# Internal kernel functions
###

function __index_Local_Linear end
function __index_Group_Linear end
function __index_Global_Linear end
@inline function __index_Local_Linear(ctx)
return KI.get_local_id().x
end

@inline function __index_Group_Linear(ctx)
return KI.get_group_id().x
end

function __index_Local_Cartesian end
function __index_Group_Cartesian end
function __index_Global_Cartesian end
@inline function __index_Global_Linear(ctx)
return KI.get_global_id().x
end

@inline function __index_Local_Cartesian(ctx)
return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x]
end
@inline function __index_Group_Cartesian(ctx)
return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x]
end
@inline function __index_Global_Cartesian(ctx)
return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x)
end

@inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
@inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
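Note: these methods give the existing `@index` macro front end a default implementation in terms of the intrinsics, so the two spellings compute the same indices. A rough side-by-side sketch (the macro-based kernel uses the unchanged KernelAbstractions API; the second form follows the examples in this PR, and both kernel names are hypothetical):

```julia
using KernelAbstractions

# Classic front end: @index(Global, Linear) lowers to __index_Global_Linear(ctx),
# which this hunk now defines as KI.get_global_id().x.
@kernel function iota_macro!(out)
    i = @index(Global, Linear)
    @inbounds out[i] = i
end

# Same kernel against the intrinsics directly; the explicit bounds check replaces
# the masking that the @kernel/ndrange machinery does implicitly.
function iota_ki!(out)
    i = KI.get_global_id().x
    if i <= length(out)
        @inbounds out[i] = i
    end
    return
end
```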
@@ -482,11 +524,6 @@ constify(arg) = adapt(ConstAdaptor(), arg)
# Backend hierarchy
###

"""

Abstract type for all KernelAbstractions backends.
"""
abstract type Backend end

"""
Abstract type for all GPU based KernelAbstractions backends.
@@ -796,29 +833,11 @@ include("macros.jl")
###

function Scratchpad end
function SharedMemory end

function __synchronize()
error("@synchronize used outside kernel or not captured")
end

@generated function __print(items...)
str = ""
args = []
SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims)

for i in 1:length(items)
item = :(items[$i])
T = items[i]
if T <: Val
item = QuoteNode(T.parameters[1])
end
push!(args, item)
end
__synchronize() = KI.barrier()

return quote
print($(args...))
end
end
__print(args...) = KI._print(args...)

# Utils
__size(args::Tuple) = Tuple{args...}
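Note: with `SharedMemory`, `__synchronize`, and `__print` now forwarding to `KI.localmemory`, `KI.barrier`, and `KI._print`, the legacy `@localmem` / `@synchronize` / `@print` macros and direct intrinsic calls share one implementation. A minimal sketch of the intrinsic forms in a kernel body (the kernel is hypothetical; the workgroup size passed as `Val(gs)` must match the launch, as in the histogram example):

```julia
using KernelAbstractions

# Hypothetical workgroup-staged copy; @localmem/@synchronize would forward to the same calls.
function copy_via_shmem!(out, in, ::Val{gs}) where {gs}
    lid = KI.get_local_id().x
    gid = KI.get_global_id().x
    tmp = KI.localmemory(eltype(in), gs)   # static workgroup-local buffer
    if gid <= length(in)
        @inbounds tmp[lid] = in[gid]
    end
    KI.barrier()                           # was: @synchronize
    if gid <= length(out)
        @inbounds out[gid] = tmp[lid]
    end
    return
end

# Launch, following the interface used in the examples above:
# KI.@kernel backend workgroupsize = 256 numworkgroups = cld(length(out), 256) copy_via_shmem!(out, in, Val(256))
```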