Skip to content

[x86] Missed optimization for AVX2 truncating cast from v8i32 to v8i8 #167138

@okaneco

Description

@okaneco

(The original code is from a Rust JPEG encoder where we wanted to push every third byte from an __m256i into a buffer.)

When an __m256i is interpreted as int32x8 and truncated to uint8x8, the generated code fails to shuffle the bytes in the high lane before combining it with the low lane; instead, it extracts and inserts those bytes one at a time. I don't know what the best instruction sequence would be for this shuffle.

https://godbolt.org/z/oj9acsErE
alive2 proof - https://alive2.llvm.org/ce/z/WzN-QY

#include <immintrin.h>
#include <stdint.h>

// Reference (scalar) form of the truncating cast: views `data` as eight
// 32-bit integers and stores each one, truncated to 8 bits, to out[0..7].
//
// data: 256-bit vector, read as int32_t[8].
// out:  destination buffer; exactly 8 bytes are written.
void src(__m256i data, uint8_t* out) {
    for (int i = 0; i < 8; i++, out++) {
        // Indexing binds tighter than the cast: element i is read as an
        // int32_t and then narrowed to uint8_t (keeps its low 8 bits).
        *out = (uint8_t)((int32_t*)&data)[i];
    }
    return;
}

// Hand-written AVX2 form of the same truncation: gathers bytes 0, 4, 8 and 12
// of each 128-bit lane with a byte shuffle, moves the two packed dwords next
// to each other, and stores the resulting 8 bytes to out[0..7].
//
// data: 256-bit vector to truncate.
// out:  destination buffer; exactly 8 bytes are written.
void tgt(__m256i data, uint8_t* out) {
    // Shuffle mask = [0, 4, 8, 12]
    // (0x0c080400 is broadcast to every 32-bit element, so within each
    // 128-bit lane every dword of A holds bytes 0, 4, 8 and 12 of that lane.)
    __m256i A = _mm256_shuffle_epi8(data, _mm256_set1_epi32(0x0c080400));
    // Shuffle lowest int32 from hi 128-bits into second int32 lane
    // (indices 0 and 4 pick dword 0 of the low and high lanes; the other six
    // result dwords are never read below.)
    __m256i B = _mm256_permutevar8x32_epi32(A, _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0));

    // Copy the low 8 bytes of B to the output buffer.
    for (int i = 0; i < 8; i++, out++) {
        *out = ((uint8_t*)&B)[i];
    }
    return;
}
.LCPI0_0:
        .byte   0
        .byte   4
        .byte   8
        .byte   12
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
# Code generated for src(): only the low 128 bits go through a byte shuffle;
# three of the four bytes from the high 128 bits are moved one at a time
# through general-purpose registers.
src:
        # xmm1 = upper 128 bits of the input (32-bit elements 4..7).
        vextracti128    xmm1, ymm0, 1
        # Low bytes of elements 5, 6 and 7, extracted individually.
        vpextrb eax, xmm1, 4
        vpextrb ecx, xmm1, 8
        vpextrb edx, xmm1, 12
        # Pack bytes 0, 4, 8, 12 of the low half into bytes 0..3 of xmm0.
        vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        # Interleave low dwords: byte 4 of xmm0 becomes the low byte of element 4.
        vpunpckldq      xmm0, xmm0, xmm1
        # Overwrite bytes 5..7 with the bytes extracted above.
        vpinsrb xmm0, xmm0, eax, 5
        vpinsrb xmm0, xmm0, ecx, 6
        vpinsrb xmm0, xmm0, edx, 7
        # Store the low 8 bytes to the output pointer.
        vmovq   qword ptr [rdi], xmm0
        vzeroupper
        ret

.LCPI1_0:
        .byte   0
        .byte   4
        .byte   8
        .byte   12
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .byte   0
        .byte   4
        .byte   8
        .byte   12
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
# Code generated for tgt(): one 256-bit byte shuffle handles both lanes.
tgt:
        # In each 128-bit lane, pack bytes 0, 4, 8, 12 into the lane's low dword.
        vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
        # xmm1 = upper 128 bits (packed bytes of the high lane in its low dword).
        vextracti128    xmm1, ymm0, 1
        # Interleave low dwords: dword 0 = low-lane bytes, dword 1 = high-lane bytes.
        vpunpckldq      xmm0, xmm0, xmm1
        # Store the low 8 bytes to the output pointer.
        vmovq   qword ptr [rdi], xmm0
        vzeroupper
        ret
LLVM IR
define void @src(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
  %#0 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#1 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#2 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.2 = extractelement <32 x i8> %#2, i64 8
  %#3 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.3 = extractelement <32 x i8> %#3, i64 12
  %#4 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.4 = extractelement <32 x i8> %#4, i64 16
  %#5 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.5 = extractelement <32 x i8> %#5, i64 20
  %#6 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.6 = extractelement <32 x i8> %#6, i64 24
  %#7 = bitcast <4 x i64> noundef %data to <32 x i8>
  %conv.7 = extractelement <32 x i8> %#7, i64 28
  %#8 = shufflevector <32 x i8> %#0, <32 x i8> %#1, 0, 36, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
  %#9 = insertelement <8 x i8> %#8, i8 %conv.2, i64 2
  %#10 = insertelement <8 x i8> %#9, i8 %conv.3, i64 3
  %#11 = insertelement <8 x i8> %#10, i8 %conv.4, i64 4
  %#12 = insertelement <8 x i8> %#11, i8 %conv.5, i64 5
  %#13 = insertelement <8 x i8> %#12, i8 %conv.6, i64 6
  %#14 = insertelement <8 x i8> %#13, i8 %conv.7, i64 7
  store <8 x i8> %#14, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
  ret void
}
=>
define void @tgt(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
  %#0 = bitcast <4 x i64> noundef %data to <32 x i8>
  %#1 = shufflevector <32 x i8> %#0, <32 x i8> poison, 0, 4, 8, 12, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 16, 20, 24, 28, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
  %#2 = bitcast <32 x i8> %#1 to <8 x i32>
  %B.sroa.0.0.vec.extract = shufflevector <8 x i32> %#2, <8 x i32> poison, 0, 4
  store <2 x i32> %B.sroa.0.0.vec.extract, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
  ret void
}
Transformation seems to be correct!

When the function is written as follows, the shuffle is recognized in C (but the equivalent rewrite doesn't help in Rust).

// Byte-indexed form of the same truncation: copies bytes 0, 4, 8, ..., 28 of
// `data` to out[0..7].
//
// data: 256-bit vector, read as uint8_t[32].
// out:  destination buffer; exactly 8 bytes are written.
void src8(__m256i data, uint8_t* out) {
    // i is the byte offset into data; it advances by 4 in the loop body while
    // out advances by 1 in the loop header.
    for (int i = 0; i < 32; out++) {
        *out = ((uint8_t*)&data)[i];
        i += 4;
    }
    return;
}
.LCPI0_1:
        .byte   0
        .byte   4
        .byte   8
        .byte   12
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
# Code generated for src8(): both 128-bit halves are packed with a byte shuffle.
src8:
        # xmm1 = upper 128 bits of the input.
        vextracti128    xmm1, ymm0, 1
        # Load the first 4 mask bytes (0, 4, 8, 12) into the low dword of xmm2.
        vmovd   xmm2, dword ptr [rip + .LCPI0_1]
        # Pack bytes 0, 4, 8, 12 of each half into that register's low dword.
        vpshufb xmm1, xmm1, xmm2
        vpshufb xmm0, xmm0, xmm2
        # Interleave low dwords: dword 0 = low-half bytes, dword 1 = high-half bytes.
        vpunpckldq      xmm0, xmm0, xmm1
        # Store the low 8 bytes to the output pointer.
        vmovq   qword ptr [rdi], xmm0
        vzeroupper
        ret
; Optimized IR for src8(): the whole loop is a single 8-element byte shuffle
; followed by one 8-byte store.
define dso_local void @src8(<4 x i64> noundef %data, ptr noundef writeonly captures(none) initializes((0, 8)) %out) local_unnamed_addr {
entry:
  ; Reinterpret the 256-bit argument as 32 individual bytes.
  %0 = bitcast <4 x i64> %data to <32 x i8>
  ; Select every fourth byte (indices 0, 4, ..., 28) into an <8 x i8>.
  %1 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  ; Write the 8 selected bytes to %out with byte alignment.
  store <8 x i8> %1, ptr %out, align 1
  ret void
}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions