From b972b532379693916411a6919aa756d1a603a83d Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Wed, 5 Nov 2025 15:52:01 -0500
Subject: [PATCH] Explicitly vectorize i32x8 to u8x8 conversion for storing
 into YCbCr

Remove the unsafe transmute in the AVX2 YCbCr conversion.

Prior to this change, the conversion was scalarized and used a bswap.
Rewriting the code to avoid reversing the array resulted in worse
codegen: the compiler extracted each byte and re-inserted it into a
SIMD register one at a time in order to store 8 bytes at once.
Shuffling the third byte of every i32 into place explicitly keeps the
conversion vectorized, and the 8 result bytes are written out with a
single 64-bit extract (two u32 extracts on 32-bit x86, where
_mm_cvtsi128_si64 is unavailable).
---
 src/avx2/ycbcr.rs | 63 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs
index 29d9d4f..54a50f7 100644
--- a/src/avx2/ycbcr.rs
+++ b/src/avx2/ycbcr.rs
@@ -1,13 +1,15 @@
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si32, _mm_srli_si128,
 };
 
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si64,
 };
 
 use alloc::vec::Vec;
@@ -27,7 +29,7 @@ macro_rules! ycbcr_image_avx2 {
         #[target_feature(enable = "avx2")]
         fn load3(data: &[u8]) -> __m256i {
             _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
-            _mm256_set_epi32(
+            _mm256_setr_epi32(
                 data[0] as i32,
                 data[1 * $num_colors] as i32,
                 data[2 * $num_colors] as i32,
@@ -41,13 +43,32 @@ macro_rules! ycbcr_image_avx2 {
 
         #[inline]
         #[target_feature(enable = "avx2")]
-        fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
-            // Safety preconditions. Optimized away in release mode, no runtime cost.
-            assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
-            assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
-            // SAFETY: size and alignment preconditions checked above.
-            // Both types are plain old data: no pointers, lifetimes, etc.
-            unsafe { core::mem::transmute(data) }
+        fn avx_as_u8_array(data: __m256i) -> [u8; 8] {
+            // Select the third byte of every i32 and pack those bytes into
+            // the low i32 of each 128-bit lane: idx = (4 * i) + 2
+            let u8_shuffle_mask = _mm256_set1_epi32(i32::from_le_bytes([2, 6, 10, 14]));
+            // Permute the lowest i32 of the high 128-bit lane
+            // into the second i32 slot
+            let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+
+            // ABCD EFGH IJKL MNOP becomes CGKO CGKO CGKO CGKO, for both lanes
+            let a = _mm256_shuffle_epi8(data, u8_shuffle_mask);
+            // A... B... becomes AB.. ....
+            let a = _mm256_permutevar8x32_epi32(a, u32_permute_mask);
+
+            // No-op cast to __m128i, then extract the low 64 bits
+            let out = _mm256_castsi256_si128(a);
+            #[cfg(target_arch = "x86_64")]
+            let bytes = _mm_cvtsi128_si64(out).to_le_bytes();
+            // _mm_cvtsi128_si64 needs 64-bit mode; on 32-bit x86, assemble
+            // the low 64 bits from two u32 extracts instead
+            #[cfg(target_arch = "x86")]
+            let bytes = {
+                let lo = _mm_cvtsi128_si32(out) as u32 as u64;
+                let hi = _mm_cvtsi128_si32(_mm_srli_si128::<4>(out)) as u32 as u64;
+                ((hi << 32) | lo).to_le_bytes()
+            };
+            bytes
         }
 
         let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
@@ -82,11 +103,7 @@ macro_rules! ycbcr_image_avx2 {
 
             let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
             let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
-            let y = _mm256_srli_epi32(y, 16);
-            let y: [i32; 8] = avx_as_i32_array(y);
-            let mut y: [u8; 8] = y.map(|x| x as u8);
-            y.reverse();
-            y_buffer.extend_from_slice(&y);
+            y_buffer.extend(avx_as_u8_array(y));
 
             let cbr = _mm256_mullo_epi32(cbmulr, r);
             let cbg = _mm256_mullo_epi32(cbmulg, g);
@@ -95,11 +112,7 @@ macro_rules! ycbcr_image_avx2 {
             let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
-            let cb = _mm256_srli_epi32(cb, 16);
-            let cb: [i32; 8] = avx_as_i32_array(cb);
-            let mut cb: [u8; 8] = cb.map(|x| x as u8);
-            cb.reverse();
-            cb_buffer.extend_from_slice(&cb);
+            cb_buffer.extend(avx_as_u8_array(cb));
 
             let crr = _mm256_mullo_epi32(crmulr, r);
             let crg = _mm256_mullo_epi32(crmulg, g);
@@ -108,11 +121,7 @@ macro_rules! ycbcr_image_avx2 {
             let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
-            let cr = _mm256_srli_epi32(cr, 16);
-            let cr: [i32; 8] = avx_as_i32_array(cr);
-            let mut cr: [u8; 8] = cr.map(|x| x as u8);
-            cr.reverse();
-            cr_buffer.extend_from_slice(&cr);
+            cr_buffer.extend(avx_as_u8_array(cr));
         }
 
         for _ in 0..self.width() % 8 {
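
For reference (illustrative note, not part of the patch): the rewrite rests
on one identity — byte 2 of a little-endian i32 is exactly `(x >> 16) as u8`,
i.e. the removed `_mm256_srli_epi32(v, 16)` followed by truncation. That is
also why `u8_shuffle_mask` uses indices 2, 6, 10, 14: they are `4 * i + 2`
for the four i32s in each 128-bit lane. A minimal scalar sketch of the
equivalence; the function name and sample values below are made up for
illustration:

    // Scalar model of avx_as_u8_array: take byte 2 of each i32.
    fn scalar_as_u8_array(v: [i32; 8]) -> [u8; 8] {
        v.map(|x| x.to_le_bytes()[2])
    }

    fn main() {
        // Arbitrary inputs, including the 0x7FFF rounding bias and the
        // 128 << 16 chroma offset used by the Cb/Cr paths
        let v: [i32; 8] = [
            0x7FFF,
            0x0001_7FFF,
            0x00FF_FFFF,
            0x0012_3456,
            0,
            1 << 16,
            (128 << 16) + 0x7FFF,
            0x0055_AA00,
        ];
        // Byte 2 of a little-endian i32 == shift right by 16, truncate to u8
        for x in v {
            assert_eq!(x.to_le_bytes()[2], ((x as u32) >> 16) as u8);
        }
        assert_eq!(scalar_as_u8_array(v), [0, 1, 0xFF, 0x12, 0, 1, 128, 0x55]);
    }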