-
Notifications
You must be signed in to change notification settings - Fork 15.1k
Open
Labels
llvm:optimizationsmissed-optimizationsimdAnything related to std::simd or vectorization in generalAnything related to std::simd or vectorization in general
Description
(The original code is from a Rust JPEG encoder where we wanted to push every fourth byte from an __m256i, i.e. the low byte of each 32-bit element, into a buffer.)
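When an __m256i is reinterpreted as int32x8 and truncated to uint8x8, LLVM fails to shuffle the bytes of the high 128-bit lane before combining it with the low lane; instead it extracts and inserts those bytes one at a time (see the vpextrb/vpinsrb sequence below). I don't know what the best instruction sequence would be for this shuffle.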
https://godbolt.org/z/oj9acsErE
alive2 proof - https://alive2.llvm.org/ce/z/WzN-QY
#include <immintrin.h>
#include <stdint.h>
// Scalar reference form of the repro: views `data` as eight int32_t elements
// and stores each one, truncated to uint8_t, into out[0..7].
// Kept exactly as written because this statement form is what produces the
// `src` assembly and IR shown in this report.
void src(__m256i data, uint8_t* out) {
for (int i = 0; i < 8; i++, out++) {
// The (uint8_t) cast applies to the i-th 32-bit element, keeping its low 8 bits.
*out = (uint8_t)((int32_t*)&data)[i];
}
return;
}
// Hand-written AVX2 form of the same operation as `src`: gathers the low byte
// of each 32-bit element of `data` and writes the eight bytes to out[0..7].
void tgt(__m256i data, uint8_t* out) {
// Shuffle mask = [0, 4, 8, 12]
// 0x0c080400 is that 4-byte mask, repeated in every 32-bit element by
// set1_epi32. As the tgt IR in this report shows, the first 32-bit element
// of each 128-bit half then holds bytes 0, 4, 8, 12 of that half.
__m256i A = _mm256_shuffle_epi8(data, _mm256_set1_epi32(0x0c080400));
// Shuffle lowest int32 from hi 128-bits into second int32 lane
// (index vector {0, 4, ...}: result element 0 = A[0], element 1 = A[4];
// the remaining result elements are not read below).
__m256i B = _mm256_permutevar8x32_epi32(A, _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0));
// Copy the first eight bytes of B (its first two 32-bit elements) to out.
for (int i = 0; i < 8; i++, out++) {
*out = ((uint8_t*)&B)[i];
}
return;
}.LCPI0_0:
.byte 0
.byte 4
.byte 8
.byte 12
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
src:
vextracti128 xmm1, ymm0, 1
vpextrb eax, xmm1, 4
vpextrb ecx, xmm1, 8
vpextrb edx, xmm1, 12
vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
vpunpckldq xmm0, xmm0, xmm1
vpinsrb xmm0, xmm0, eax, 5
vpinsrb xmm0, xmm0, ecx, 6
vpinsrb xmm0, xmm0, edx, 7
vmovq qword ptr [rdi], xmm0
vzeroupper
ret
.LCPI1_0:
.byte 0
.byte 4
.byte 8
.byte 12
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.byte 0
.byte 4
.byte 8
.byte 12
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
tgt:
vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
vextracti128 xmm1, ymm0, 1
vpunpckldq xmm0, xmm0, xmm1
vmovq qword ptr [rdi], xmm0
vzeroupper
ret
LLVM IR
define void @src(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
%#0 = bitcast <4 x i64> noundef %data to <32 x i8>
%#1 = bitcast <4 x i64> noundef %data to <32 x i8>
%#2 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.2 = extractelement <32 x i8> %#2, i64 8
%#3 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.3 = extractelement <32 x i8> %#3, i64 12
%#4 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.4 = extractelement <32 x i8> %#4, i64 16
%#5 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.5 = extractelement <32 x i8> %#5, i64 20
%#6 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.6 = extractelement <32 x i8> %#6, i64 24
%#7 = bitcast <4 x i64> noundef %data to <32 x i8>
%conv.7 = extractelement <32 x i8> %#7, i64 28
%#8 = shufflevector <32 x i8> %#0, <32 x i8> %#1, 0, 36, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
%#9 = insertelement <8 x i8> %#8, i8 %conv.2, i64 2
%#10 = insertelement <8 x i8> %#9, i8 %conv.3, i64 3
%#11 = insertelement <8 x i8> %#10, i8 %conv.4, i64 4
%#12 = insertelement <8 x i8> %#11, i8 %conv.5, i64 5
%#13 = insertelement <8 x i8> %#12, i8 %conv.6, i64 6
%#14 = insertelement <8 x i8> %#13, i8 %conv.7, i64 7
store <8 x i8> %#14, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
ret void
}
=>
define void @tgt(<4 x i64> noundef %data, ptr nocapture noread noundef initializes((0, 8)) %out) {
entry:
%#0 = bitcast <4 x i64> noundef %data to <32 x i8>
%#1 = shufflevector <32 x i8> %#0, <32 x i8> poison, 0, 4, 8, 12, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 16, 20, 24, 28, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295
%#2 = bitcast <32 x i8> %#1 to <8 x i32>
%B.sroa.0.0.vec.extract = shufflevector <8 x i32> %#2, <8 x i32> poison, 0, 4
store <2 x i32> %B.sroa.0.0.vec.extract, ptr nocapture noread noundef initializes((0, 8)) %out, align 1
ret void
}
Transformation seems to be correct!
When the loop is instead written as follows, indexing the bytes of the vector directly, the shuffle is recognized in C (but this does not help in Rust).
// Byte-indexed variant of the repro: views `data` as bytes and copies bytes
// 0, 4, 8, ..., 28 to out[0..7]. The report states that this form is
// recognized as a single shuffle in C.
void src8(__m256i data, uint8_t* out) {
// `i` is the source byte index (advanced by 4 in the body); `out` advances
// by one per iteration, so the loop runs eight times.
for (int i = 0; i < 32; out++) {
*out = ((uint8_t*)&data)[i];
i += 4;
}
return;
}.LCPI0_1:
.byte 0
.byte 4
.byte 8
.byte 12
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
.byte 0
src8:
vextracti128 xmm1, ymm0, 1
vmovd xmm2, dword ptr [rip + .LCPI0_1]
vpshufb xmm1, xmm1, xmm2
vpshufb xmm0, xmm0, xmm2
vpunpckldq xmm0, xmm0, xmm1
vmovq qword ptr [rdi], xmm0
vzeroupper
ret
define dso_local void @src8(<4 x i64> noundef %data, ptr noundef writeonly captures(none) initializes((0, 8)) %out) local_unnamed_addr {
entry:
%0 = bitcast <4 x i64> %data to <32 x i8>
%1 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i8> %1, ptr %out, align 1
ret void
}
Metadata
Metadata
Assignees
Labels
llvm:optimizationsmissed-optimizationsimdAnything related to std::simd or vectorization in generalAnything related to std::simd or vectorization in general