@@ -39,31 +39,35 @@ struct AlignedColMajor{T} <: LayoutBase{T} end
3939
4040# TODO: clean up vectorisation
"""
    load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size})

Read the region of `workspace` described by `tile` in chunks of
`vec_len = 16 ÷ sizeof(T)` elements of `T` along the first tile dimension.

Returns an `MArray` of size `(size[1] ÷ vec_len, size[2])` whose elements are
`NTuple{N, VecElement{Float32}}` with `N = (sizeof(T) * vec_len) ÷ sizeof(Float32)`.
Entry `[k, j]` holds the chunk that starts at tile row `(k - 1) * vec_len + 1` of
tile column `j`.

NOTE(review): `size[1]` is assumed to be a multiple of `vec_len`; otherwise the
result has fewer slots than the loop visits — confirm against callers.
"""
@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size}
    vec_len = 16 ÷ sizeof(T)
    N = (sizeof(T) * vec_len) ÷ sizeof(Float32)
    res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef)

    @unroll for j = 1 : size[2]
        @unroll for i = 1 : vec_len : size[1]
            t = translate(tile, (i - 1, j - 1))

            linear_base = linearise(t.base, Base.size(workspace))
            linear_offset = linearise(t.offset, Base.size(workspace))

            # `i` advances by `vec_len`, but `res` only has `size[1] ÷ vec_len` rows:
            # map the strided element index onto the dense chunk index so that the
            # `@inbounds` write below stays inside `res` when `size[1] > vec_len`.
            slot = (i - 1) ÷ vec_len + 1

            # presumably `vloada` is an aligned vector load — confirm against its definition
            @inbounds res[slot, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset)
        end
    end

    return res
end
5759
"""
    store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size})

Write `value` into the region of `workspace` described by `tile`, one chunk of
`16 ÷ sizeof(T)` elements of `T` at a time along the first tile dimension.

For every tile column `j` and every chunk start row `i`, `value[i, j]` is handed
to `vstorea!` together with a pointer to the chunk's linearised base position in
`workspace` and its linearised offset.
"""
@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size}
    chunk = 16 ÷ sizeof(T)
    dims = Base.size(workspace)

    @unroll for j = 1 : size[2]
        @unroll for i = 1 : chunk : size[1]
            shifted = translate(tile, (i - 1, j - 1))

            base_idx = linearise(shifted.base, dims)
            offset_idx = linearise(shifted.offset, dims)
            dest = pointer(workspace, base_idx)

            # NOTE(review): `value` is indexed with the strided row `i`, not a dense
            # chunk index — verify this matches the layout callers pass in.
            vstorea!(Vec{chunk, T}, dest, value[i, j], offset_idx)
        end
    end
end
0 commit comments