Skip to content

Commit 4aa8b72

Browse files
committed
Faster SIMD approach
1 parent aebfa5b commit 4aa8b72

File tree

2 files changed

+185
-72
lines changed

2 files changed

+185
-72
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Improvements to solutions are always appreciated. Please see the
5858
## Performance
5959

6060
Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
61-
All 250 solutions from 2024 to 2015 complete sequentially in **511 milliseconds**.
61+
All 250 solutions from 2024 to 2015 complete sequentially in **510 milliseconds**.
6262
Interestingly, 86% of the total time is spent on just 9 solutions.
6363
Performance is reasonable even on older hardware, for example a 2011 MacBook Pro with an
6464
[Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 250 solutions.
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
6767

6868
| Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
6969
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70-
| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 9 | 7 | 5 | 4 |
70+
| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
7171

7272
## 2024
7373

@@ -190,7 +190,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
190190
| 17 | [Trick Shot](https://adventofcode.com/2021/day/17) | [Source](src/year2021/day17.rs) | 7 |
191191
| 18 | [Snailfish](https://adventofcode.com/2021/day/18) | [Source](src/year2021/day18.rs) | 476 |
192192
| 19 | [Beacon Scanner](https://adventofcode.com/2021/day/19) | [Source](src/year2021/day19.rs) | 615 |
193-
| 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 2066 |
193+
| 20 | [Trench Map](https://adventofcode.com/2021/day/20) | [Source](src/year2021/day20.rs) | 511 |
194194
| 21 | [Dirac Dice](https://adventofcode.com/2021/day/21) | [Source](src/year2021/day21.rs) | 278 |
195195
| 22 | [Reactor Reboot](https://adventofcode.com/2021/day/22) | [Source](src/year2021/day22.rs) | 378 |
196196
| 23 | [Amphipod](https://adventofcode.com/2021/day/23) | [Source](src/year2021/day23.rs) | 1714 |

src/year2021/day20.rs

Lines changed: 182 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -12,89 +12,202 @@
1212
//!
1313
//! The algorithm keeps track of the bounds of the expanding square and supplies a `default` value,
1414
//! that in the example case is always zero, but in the real data toggles between zero and one.
15-
pub struct Input {
16-
size: usize,
17-
algorithm: [u8; 512],
18-
pixels: [u8; 40_000],
19-
}
15+
//!
16+
//! A faster SIMD approach, enabled by the `simd` feature, processes cells 16 at a time.
17+
use crate::util::grid::*;
18+
use crate::util::point::*;
19+
20+
type Input = (Vec<u8>, Grid<u8>);
2021

2122
pub fn parse(input: &str) -> Input {
22-
// `#` is odd and `.` is even so we can convert to one or zero by bitwise AND with 1.
23-
let bits: Vec<Vec<_>> =
24-
input.lines().map(|line| line.bytes().map(|b| b & 1).collect()).collect();
25-
let size = bits.len() - 2;
26-
let algorithm = bits[0][..512].try_into().unwrap();
27-
28-
// Offset the initial square by 50 cells in both dimensions.
29-
// The square expands by at most one in each step so this is enough room to stay within bounds.
30-
let mut pixels = [0; 40_000];
31-
for (i, row) in bits[2..].iter().enumerate() {
32-
let start = (i + 50) * 200 + 50;
33-
let end = start + size;
34-
pixels[start..end].copy_from_slice(&row[..size]);
35-
}
23+
let (prefix, suffix) = input.split_once("\n\n").unwrap();
24+
25+
let algorithm = prefix.bytes().map(|b| u8::from(b == b'#')).collect();
26+
let grid = Grid::parse(suffix);
3627

37-
Input { size, algorithm, pixels }
28+
(algorithm, grid)
3829
}
3930

40-
pub fn part1(input: &Input) -> usize {
41-
enhance(input, 2)
31+
pub fn part1(input: &Input) -> u32 {
32+
#[cfg(not(feature = "simd"))]
33+
let result = scalar::enhance(input, 2);
34+
35+
#[cfg(feature = "simd")]
36+
let result = simd::enhance(input, 2);
37+
38+
result
4239
}
4340

44-
pub fn part2(input: &Input) -> usize {
45-
enhance(input, 50)
41+
pub fn part2(input: &Input) -> u32 {
42+
#[cfg(not(feature = "simd"))]
43+
let result = scalar::enhance(input, 50);
44+
45+
#[cfg(feature = "simd")]
46+
let result = simd::enhance(input, 50);
47+
48+
result
4649
}
4750

48-
fn enhance(input: &Input, steps: usize) -> usize {
49-
let algorithm = input.algorithm;
50-
let mut pixels = input.pixels;
51-
let mut next = [0; 40_000];
52-
53-
let mut start = 50;
54-
let mut end = 50 + input.size as i32;
55-
let mut default = 0;
56-
57-
for _ in 0..steps {
58-
for y in (start - 1)..(end + 1) {
59-
// If the pixel is within current bounds then return it, or else use the `default`
60-
// edge value specified by the enhancement algorithm.
61-
let helper = |sx, sy, shift| {
62-
let result = if sx < end && sy >= start && sy < end {
63-
pixels[(sy * 200 + sx) as usize] as usize
64-
} else {
65-
default as usize
51+
#[cfg(not(feature = "simd"))]
52+
mod scalar {
53+
use super::*;
54+
55+
pub(super) fn enhance(input: &Input, steps: i32) -> u32 {
56+
let (algorithm, grid) = input;
57+
58+
// Offset the initial square by `steps` + 1 buffer cells in both dimensions.
59+
// The square expands by at most one in each step so this is enough room to stay within bounds.
60+
let extra = steps + 1;
61+
let offset = Point::new(extra, extra);
62+
let mut pixels = Grid::new(grid.width + 2 * extra, grid.height + 2 * extra, 0);
63+
64+
for y in 0..grid.height {
65+
for x in 0..grid.width {
66+
let point = Point::new(x, y);
67+
pixels[point + offset] = u8::from(grid[point] == b'#');
68+
}
69+
}
70+
71+
let mut next = pixels.clone();
72+
let mut default = 0;
73+
let mut start = extra;
74+
let mut end = extra + grid.width;
75+
76+
for _ in 0..steps {
77+
for y in (start - 1)..(end + 1) {
78+
// If the pixel is within current bounds then return it, or else use the `default`
79+
// edge value specified by the enhancement algorithm.
80+
let helper = |sx, sy, shift| {
81+
let result = if sx < end && start <= sy && sy < end {
82+
pixels[Point::new(sx, sy)]
83+
} else {
84+
default
85+
};
86+
(result as usize) << shift
6687
};
67-
result << shift
68-
};
69-
70-
// If the edge pixels are 1 then the initial edge will look like
71-
// [##a]
72-
// [##b]
73-
// [##c]
74-
// or 11a11b11c when encoded as an index.
75-
let mut index = if default == 1 { 0b11011011 } else { 0b00000000 };
76-
77-
for x in (start - 1)..(end + 1) {
78-
// Keeps a sliding window of the index, updated as we evaluate the row from
79-
// left to right. Shift the index left by one each turn, updating the values from
80-
// the three new rightmost pixels entering the window.
81-
index = ((index << 1) & 0b110110110)
82-
+ helper(x + 1, y - 1, 6)
83-
+ helper(x + 1, y, 3)
84-
+ helper(x + 1, y + 1, 0);
85-
86-
next[(y * 200 + x) as usize] = algorithm[index];
88+
89+
// If the edge pixels are 1 then the initial edge will look like
90+
// [##a]
91+
// [##b]
92+
// [##c]
93+
// or 11a11b11c when encoded as an index.
94+
let mut index = if default == 1 { 0b11011011 } else { 0b00000000 };
95+
96+
for x in (start - 1)..(end + 1) {
97+
// Keeps a sliding window of the index, updated as we evaluate the row from
98+
// left to right. Shift the index left by one each turn, updating the values from
99+
// the three new rightmost pixels entering the window.
100+
index = ((index << 1) & 0b110110110)
101+
+ helper(x + 1, y - 1, 6)
102+
+ helper(x + 1, y, 3)
103+
+ helper(x + 1, y + 1, 0);
104+
105+
next[Point::new(x, y)] = algorithm[index];
106+
}
107+
}
108+
109+
// Swap grids then calculate the next value for edge pixels beyond the boundary.
110+
(pixels, next) = (next, pixels);
111+
default = if default == 0 { algorithm[0] } else { algorithm[511] };
112+
113+
// Boundaries expand by one each turn
114+
start -= 1;
115+
end += 1;
116+
}
117+
118+
pixels.bytes.iter().map(|&b| b as u32).sum()
119+
}
120+
}
121+
122+
#[cfg(feature = "simd")]
123+
mod simd {
124+
use super::*;
125+
use std::simd::Simd;
126+
use std::simd::num::SimdUint as _;
127+
128+
const LANE_WIDTH: usize = 16;
129+
type Vector = Simd<u16, LANE_WIDTH>;
130+
131+
pub(super) fn enhance(input: &Input, steps: i32) -> u32 {
132+
let (algorithm, grid) = input;
133+
134+
// Offset the initial square by `steps` + 1 buffer cells in both dimensions.
135+
// The square expands by at most one in each step so this is enough room to stay within bounds.
136+
let extra = steps + 1;
137+
let offset = Point::new(extra, extra);
138+
let mut pixels =
139+
Grid::new(grid.width + 2 * extra + LANE_WIDTH as i32, grid.height + 2 * extra, 0);
140+
141+
for y in 0..grid.height {
142+
for x in 0..grid.width {
143+
let point = Point::new(x, y);
144+
pixels[point + offset] = u8::from(grid[point] == b'#');
145+
}
146+
}
147+
148+
let mut next = pixels.clone();
149+
let mut default = 0;
150+
let mut start = extra - 1;
151+
let mut end = extra + grid.width + 1;
152+
153+
for _ in 0..steps {
154+
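// Fill the border columns with `default`, the current value of the edge pixels on the
// infinite grid, which can flip flop between on and off from one step to the next.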
155+
for y in (start - 1)..(end + 1) {
156+
pixels[Point::new(start - 1, y)] = default;
157+
pixels[Point::new(start, y)] = default;
158+
pixels[Point::new(end - 1, y)] = default;
159+
pixels[Point::new(end, y)] = default;
160+
}
161+
162+
for x in (start..end).step_by(LANE_WIDTH) {
163+
let edge = Simd::splat(if default == 0 { 0b000 } else { 0b111 });
164+
let mut above = edge;
165+
let mut row = edge;
166+
167+
for y in start..end {
168+
let below = if y < end - 2 { from_grid(&pixels, x, y + 1) } else { edge };
169+
170+
let indices = (above << 6) | (row << 3) | below;
171+
above = row;
172+
row = below;
173+
174+
let base = (pixels.width * y + x) as usize;
175+
for (i, j) in indices.to_array().into_iter().enumerate() {
176+
next.bytes[base + i] = algorithm[j as usize];
177+
}
178+
}
87179
}
180+
181+
// Swap grids then calculate the next value for edge pixels beyond the boundary.
182+
(pixels, next) = (next, pixels);
183+
default = if default == 0 { algorithm[0] } else { algorithm[511] };
184+
185+
// Boundaries expand by one each turn.
186+
start -= 1;
187+
end += 1;
88188
}
89189

90-
// Boundaries expand by one each turn
91-
pixels = next;
92-
start -= 1;
93-
end += 1;
190+
// Only count pixels inside the boundary.
191+
let mut result = 0;
94192

95-
// Calculate the next value for edge pixels beyond the boundary.
96-
default = if default == 0 { algorithm[0] } else { algorithm[511] };
193+
for y in 1..end - 1 {
194+
for x in 1..end - 1 {
195+
result += pixels[Point::new(x, y)] as u32;
196+
}
197+
}
198+
199+
result
97200
}
98201

99-
pixels.iter().filter(|&&p| p == 1).count()
202+
#[inline]
203+
fn from_grid(grid: &Grid<u8>, x: i32, y: i32) -> Vector {
204+
let index = (grid.width * y + x) as usize;
205+
206+
let row = Simd::from_slice(&grid.bytes[index..]);
207+
let left = row.shift_elements_right::<1>(grid[Point::new(x - 1, y)]);
208+
let right = row.shift_elements_left::<1>(grid[Point::new(x + LANE_WIDTH as i32, y)]);
209+
210+
let result = (left << 2) | (row << 1) | right;
211+
result.cast()
212+
}
100213
}

0 commit comments

Comments
 (0)