Skip to content

Commit d9c4ea4

Browse files
ikawrakow and Iwan Kawrakow authored
Interleave 8 rows (Q8_0, IQ4_XS) (#178)
* Try interleaving 8 rows for iq4_xs On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B. TG-128 reaches max. performance at 2 threads and is slightly higher than 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads and 14.28 t/s @ 4 threads). * Try interleaving 8 iq4_xs rows It is also faster on AVX2. This is the NEON implementation. It is a tiny bit faster than 4 interleaved rows (~0.5%). So, this looks like a winner given the Zen4/AVX2 improvement without associated NEON regression. * Cleanup * 8-rows interleaved q8_0 (AVX2) * 8-rows interleaved q8_0 (Zen4) * 8-rows interleaved q8_0 (Zen4) - slightly better PP-512 is now 284 t/s compared to 257 t/s for 4-rows interleaved. TG-128 reaches peak of 8.16 t/s at just 2 threads compared to 7.95 t/s @ 4 threads before. * 8-rows interleaved q8_0 (NEON) PP-512 is slightly better (138 t/s vs 132.5 t/s), TG-128 is about the same. * FA: repack Q8_0 to Q8_0_R8 * Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4) * FA: repack Q8_0 to Q8_0_R8 (NEON) Very slightly faster than the general purpose gemm, slightly slower than the D = 128 special case gemm mul_mat_q8_0_r4_q8_0_128. Still removing mul_mat_q8_0_r4_q8_0_128 as we simply don't have enough vector registers to hold 8 interleaved rows, so there is no point to have the special purpose implementation. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 814d3e0 commit d9c4ea4

File tree

6 files changed

+437
-431
lines changed

6 files changed

+437
-431
lines changed

ggml/src/ggml-common.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,11 @@ typedef struct {
236236
int8_t qs[4*QK8_0];
237237
} block_q8_0_x4;
238238
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");
239+
typedef struct {
240+
ggml_half d[8];
241+
int8_t qs[8*QK8_0];
242+
} block_q8_0_r8;
243+
static_assert(sizeof(block_q8_0_r8) == 8*sizeof(block_q8_0), "wrong q8_0_r8 block size/padding");
239244

240245
typedef struct {
241246
ggml_half d[4]; // deltas for 4 q4_0 blocks
@@ -534,12 +539,12 @@ typedef struct {
534539
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
535540

536541
typedef struct {
537-
ggml_half d[4];
538-
uint8_t scales_h[QK_K/32];
539-
uint8_t scales_l[QK_K/16];
540-
uint8_t qs[QK_K*2];
542+
ggml_half d[8];
543+
uint8_t scales_h[QK_K/16];
544+
uint8_t scales_l[QK_K/ 8];
545+
uint8_t qs[QK_K*4];
541546
} block_iq4_xs_r4;
542-
static_assert(sizeof(block_iq4_xs_r4) == 4*sizeof(ggml_half) + QK_K/32 + QK_K/16 + QK_K*2, "wrong iq4_xs_rs block size/padding");
547+
static_assert(sizeof(block_iq4_xs_r4) == 8*sizeof(block_iq4_xs), "wrong iq4_xs_rs block size/padding");
543548

544549
typedef struct {
545550
uint8_t scales[QK_K/32];

ggml/src/ggml-quants.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -936,7 +936,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
936936

937937
#if defined(__ARM_NEON)
938938
for (int i = 0; i < nb; i++) {
939-
int i4 = i/4, ir = i%4;
940939
float32x4_t srcv [8];
941940
float32x4_t asrcv[8];
942941
float32x4_t amaxv[8];

0 commit comments

Comments
 (0)