Faster SIMD approach

maneatingape · maneatingape · commit 4aa8b728119f · 2025-10-11T19:31:18.000+01:00
diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ Improvements to solutions are always appreciated. Please see the
 ## Performance
 
 Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
-All 250 solutions from 2024 to 2015 complete sequentially in **511 milliseconds**.
+All 250 solutions from 2024 to 2015 complete sequentially in **510 milliseconds**.
 Interestingly 86% of the total time is spent on just 9 solutions.
 Performance is reasonable even on older hardware, for example a 2011 MacBook Pro with an
 [Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 250 solutions.
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 
 | Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 9 | 7 | 5 | 4 |
+| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
 
 ## 2024
 
@@ -190,7 +190,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
 | 17 | [Trick Shot](https://adventofcode.com/2021/day/17) | [Source](src/year2021/day17.rs) | 7 |
 | 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 476 |
 | 19 | [Beacon Scanner](https://adventofcode.com/2021/day/19) | [Source](src/year2021/day19.rs) | 615 |
-| 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 2066 |
+| 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 511 |
 | 21 | [Dirac Dice](https://adventofcode.com/2021/day/21) | [Source](src/year2021/day21.rs) | 278 |
 | 22 | [Reactor Reboot](https://adventofcode.com/2021/day/22) | [Source](src/year2021/day22.rs) | 378 |
 | 23 | [Amphipod](https://adventofcode.com/2021/day/23) | [Source](src/year2021/day23.rs) | 1714 |
diff --git a/src/year2021/day20.rs b/src/year2021/day20.rs
@@ -12,89 +12,202 @@
 //!
 //! The algorithm keeps track of the bounds of the expanding square and supplies a `default` value,
 //! that in the example case is always zero, but in the real data toggles between zero and one.
-pub struct Input {
-    size: usize,
-    algorithm: [u8; 512],
-    pixels: [u8; 40_000],
-}
+//!
+//! A faster SIMD approach processes cells 16 at a time.
+use crate::util::grid::*;
+use crate::util::point::*;
+
+type Input = (Vec<u8>, Grid<u8>);
 
 pub fn parse(input: &str) -> Input {
-    // `#` is odd and `.` is even so we can convert to one or zero by bitwise AND with 1.
-    let bits: Vec<Vec<_>> =
-        input.lines().map(|line| line.bytes().map(|b| b & 1).collect()).collect();
-    let size = bits.len() - 2;
-    let algorithm = bits[0][..512].try_into().unwrap();
-
-    // Offset the initial square by 50 cells in both dimensions.
-    // The square expands by at most one in each step so this is enough room to stay within bounds.
-    let mut pixels = [0; 40_000];
-    for (i, row) in bits[2..].iter().enumerate() {
-        let start = (i + 50) * 200 + 50;
-        let end = start + size;
-        pixels[start..end].copy_from_slice(&row[..size]);
-    }
+    let (prefix, suffix) = input.split_once("\n\n").unwrap();
+
+    let algorithm = prefix.bytes().map(|b| u8::from(b == b'#')).collect();
+    let grid = Grid::parse(suffix);
 
-    Input { size, algorithm, pixels }
+    (algorithm, grid)
 }
 
-pub fn part1(input: &Input) -> usize {
-    enhance(input, 2)
+pub fn part1(input: &Input) -> u32 {
+    #[cfg(not(feature = "simd"))]
+    let result = scalar::enhance(input, 2);
+
+    #[cfg(feature = "simd")]
+    let result = simd::enhance(input, 2);
+
+    result
 }
 
-pub fn part2(input: &Input) -> usize {
-    enhance(input, 50)
+pub fn part2(input: &Input) -> u32 {
+    #[cfg(not(feature = "simd"))]
+    let result = scalar::enhance(input, 50);
+
+    #[cfg(feature = "simd")]
+    let result = simd::enhance(input, 50);
+
+    result
 }
 
-fn enhance(input: &Input, steps: usize) -> usize {
-    let algorithm = input.algorithm;
-    let mut pixels = input.pixels;
-    let mut next = [0; 40_000];
-
-    let mut start = 50;
-    let mut end = 50 + input.size as i32;
-    let mut default = 0;
-
-    for _ in 0..steps {
-        for y in (start - 1)..(end + 1) {
-            // If the pixel is within current bounds then return it, or else use the `default`
-            // edge value specified by the enhancement algorithm.
-            let helper = |sx, sy, shift| {
-                let result = if sx < end && sy >= start && sy < end {
-                    pixels[(sy * 200 + sx) as usize] as usize
-                } else {
-                    default as usize
+#[cfg(not(feature = "simd"))]
+mod scalar {
+    use super::*;
+
+    pub(super) fn enhance(input: &Input, steps: i32) -> u32 {
+        let (algorithm, grid) = input;
+
+        // Offset the initial square by `steps` + 1 buffer cells in both dimensions.
+        // The square expands by at most one in each step so this is enough room to stay within bounds.
+        let extra = steps + 1;
+        let offset = Point::new(extra, extra);
+        let mut pixels = Grid::new(grid.width + 2 * extra, grid.height + 2 * extra, 0);
+
+        for y in 0..grid.height {
+            for x in 0..grid.width {
+                let point = Point::new(x, y);
+                pixels[point + offset] = u8::from(grid[point] == b'#');
+            }
+        }
+
+        let mut next = pixels.clone();
+        let mut default = 0;
+        let mut start = extra;
+        let mut end = extra + grid.width;
+
+        for _ in 0..steps {
+            for y in (start - 1)..(end + 1) {
+                // If the pixel is within current bounds then return it, or else use the `default`
+                // edge value specified by the enhancement algorithm.
+                let helper = |sx, sy, shift| {
+                    let result = if sx < end && start <= sy && sy < end {
+                        pixels[Point::new(sx, sy)]
+                    } else {
+                        default
+                    };
+                    (result as usize) << shift
                 };
-                result << shift
-            };
-
-            // If the edge pixels are 1 then the initial edge will look like
-            // [##a]
-            // [##b]
-            // [##c]
-            // or 11a11b11c when encoded as an index.
-            let mut index = if default == 1 { 0b11011011 } else { 0b00000000 };
-
-            for x in (start - 1)..(end + 1) {
-                // Keeps a sliding window of the index, updated as we evaluate the row from
-                // left to right. Shift the index left by one each turn, updating the values from
-                // the three new rightmost pixels entering the window.
-                index = ((index << 1) & 0b110110110)
-                    + helper(x + 1, y - 1, 6)
-                    + helper(x + 1, y, 3)
-                    + helper(x + 1, y + 1, 0);
-
-                next[(y * 200 + x) as usize] = algorithm[index];
+
+                // If the edge pixels are 1 then the initial edge will look like
+                // [##a]
+                // [##b]
+                // [##c]
+                // or 11a11b11c when encoded as an index.
+                let mut index = if default == 1 { 0b11011011 } else { 0b00000000 };
+
+                for x in (start - 1)..(end + 1) {
+                    // Keeps a sliding window of the index, updated as we evaluate the row from
+                    // left to right. Shift the index left by one each turn, updating the values from
+                    // the three new rightmost pixels entering the window.
+                    index = ((index << 1) & 0b110110110)
+                        + helper(x + 1, y - 1, 6)
+                        + helper(x + 1, y, 3)
+                        + helper(x + 1, y + 1, 0);
+
+                    next[Point::new(x, y)] = algorithm[index];
+                }
+            }
+
+            // Swap grids then calculate the next value for edge pixels beyond the boundary.
+            (pixels, next) = (next, pixels);
+            default = if default == 0 { algorithm[0] } else { algorithm[511] };
+
+            // Boundaries expand by one each turn.
+            start -= 1;
+            end += 1;
+        }
+
+        pixels.bytes.iter().map(|&b| b as u32).sum()
+    }
+}
+
+#[cfg(feature = "simd")]
+mod simd {
+    use super::*;
+    use std::simd::Simd;
+    use std::simd::num::SimdUint as _;
+
+    const LANE_WIDTH: usize = 16;
+    type Vector = Simd<u16, LANE_WIDTH>;
+
+    pub(super) fn enhance(input: &Input, steps: i32) -> u32 {
+        let (algorithm, grid) = input;
+
+        // Offset the initial square by `steps` + 1 buffer cells in both dimensions.
+        // The square expands by at most one in each step so this is enough room to stay within bounds.
+        let extra = steps + 1;
+        let offset = Point::new(extra, extra);
+        let mut pixels =
+            Grid::new(grid.width + 2 * extra + LANE_WIDTH as i32, grid.height + 2 * extra, 0);
+
+        for y in 0..grid.height {
+            for x in 0..grid.width {
+                let point = Point::new(x, y);
+                pixels[point + offset] = u8::from(grid[point] == b'#');
+            }
+        }
+
+        let mut next = pixels.clone();
+        let mut default = 0;
+        let mut start = extra - 1;
+        let mut end = extra + grid.width + 1;
+
+        for _ in 0..steps {
+            // Edge pixels on the infinite grid flip-flop between on and off.
+            for y in (start - 1)..(end + 1) {
+                pixels[Point::new(start - 1, y)] = default;
+                pixels[Point::new(start, y)] = default;
+                pixels[Point::new(end - 1, y)] = default;
+                pixels[Point::new(end, y)] = default;
+            }
+
+            for x in (start..end).step_by(LANE_WIDTH) {
+                let edge = Simd::splat(if default == 0 { 0b000 } else { 0b111 });
+                let mut above = edge;
+                let mut row = edge;
+
+                for y in start..end {
+                    let below = if y < end - 2 { from_grid(&pixels, x, y + 1) } else { edge };
+
+                    let indices = (above << 6) | (row << 3) | below;
+                    above = row;
+                    row = below;
+
+                    let base = (pixels.width * y + x) as usize;
+                    for (i, j) in indices.to_array().into_iter().enumerate() {
+                        next.bytes[base + i] = algorithm[j as usize];
+                    }
+                }
             }
+
+            // Swap grids then calculate the next value for edge pixels beyond the boundary.
+            (pixels, next) = (next, pixels);
+            default = if default == 0 { algorithm[0] } else { algorithm[511] };
+
+            // Boundaries expand by one each turn.
+            start -= 1;
+            end += 1;
         }
 
-        // Boundaries expand by one each turn
-        pixels = next;
-        start -= 1;
-        end += 1;
+        // Only count pixels inside the boundary.
+        let mut result = 0;
 
-        // Calculate the next value for edge pixels beyond the boundary.
-        default = if default == 0 { algorithm[0] } else { algorithm[511] };
+        for y in 1..end - 1 {
+            for x in 1..end - 1 {
+                result += pixels[Point::new(x, y)] as u32;
+            }
+        }
+
+        result
     }
 
-    pixels.iter().filter(|&&p| p == 1).count()
+    #[inline]
+    fn from_grid(grid: &Grid<u8>, x: i32, y: i32) -> Vector {
+        let index = (grid.width * y + x) as usize;
+
+        let row = Simd::from_slice(&grid.bytes[index..]);
+        let left = row.shift_elements_right::<1>(grid[Point::new(x - 1, y)]);
+        let right = row.shift_elements_left::<1>(grid[Point::new(x + LANE_WIDTH as i32, y)]);
+
+        let result = (left << 2) | (row << 1) | right;
+        result.cast()
+    }
 }