Skip to content

Commit 5a6963d

Browse files
committed
Faster approach with flatter nested structure and SIMD search of leaf elements
1 parent 3e10f18 commit 5a6963d

File tree

2 files changed

+79
-42
lines changed

2 files changed

+79
-42
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Advent of Code [![checks-badge]][checks-link] [![docs-badge]][docs-link]
22

33
Blazing fast Rust solutions for every [Advent of Code] puzzle from 2015 to 2024, taking
4-
**502 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
4+
**501 milliseconds** to solve all 500 stars. Each solution is carefully optimized for performance
55
while ensuring the code remains concise, readable, and idiomatic.
66

77
## Features
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
6767

6868
| Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
6969
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70-
| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
70+
| Benchmark (ms) | 15 | 111 | 82 | 35 | 15 | 220 | 8 | 6 | 5 | 4 |
7171

7272
## 2024
7373

@@ -158,7 +158,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
158158
| 17 | [Pyroclastic Flow](https://adventofcode.com/2022/day/17) | [Source](src/year2022/day17.rs) | 71 |
159159
| 18 | [Boiling Boulders](https://adventofcode.com/2022/day/18) | [Source](src/year2022/day18.rs) | 52 |
160160
| 19 | [Not Enough Minerals](https://adventofcode.com/2022/day/19) | [Source](src/year2022/day19.rs) | 74 |
161-
| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 3785 |
161+
| 20 | [Grove Positioning System](https://adventofcode.com/2022/day/20) | [Source](src/year2022/day20.rs) | 2685 |
162162
| 21 | [Monkey Math](https://adventofcode.com/2022/day/21) | [Source](src/year2022/day21.rs) | 64 |
163163
| 22 | [Monkey Map](https://adventofcode.com/2022/day/22) | [Source](src/year2022/day22.rs) | 61 |
164164
| 23 | [Unstable Diffusion](https://adventofcode.com/2022/day/23) | [Source](src/year2022/day23.rs) | 1521 |

src/year2022/day20.rs

Lines changed: 76 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515
//! although perhaps adding [balancing rotations](https://en.wikipedia.org/wiki/Tree_rotation)
1616
//! to the tree would make it faster.
1717
use crate::util::parse::*;
18+
use std::array::from_fn;
19+
use std::iter::repeat_n;
20+
21+
struct PaddedVec {
22+
size: usize,
23+
vec: Vec<u16>,
24+
}
1825

1926
pub fn parse(input: &str) -> Vec<i64> {
2027
input.iter_signed().collect()
@@ -32,72 +39,70 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
3239
// Important nuance, size is one less because we don't consider the moving number.
3340
let size = input.len() - 1;
3441
// Another nuance, input contains duplicate numbers, so use index to refer to each number uniquely.
35-
let indices: Vec<_> = (0..input.len()).collect();
42+
let indices: Vec<_> = (0..input.len() as u16).collect();
3643
// Pre-process the numbers, converting any negative indices to positive indices that will wrap.
3744
// For example, -1 becomes 4998.
3845
let numbers: Vec<_> =
39-
input.iter().map(|n| (n * key).rem_euclid(size as i64) as usize).collect();
40-
41-
// Store first and second level indices.
42-
let mut lookup = Vec::new();
43-
// Triple nested vec of numbers.
44-
let mut mixed = Vec::new();
45-
// Size of each first level element for convenience.
46-
let mut skip = Vec::new();
47-
48-
// Break 5000 numbers into roughly equals chunks at each level. 289 = 17 * 17.
49-
for first in indices.chunks(289) {
50-
let mut outer = Vec::new();
51-
52-
for second in first.chunks(17) {
53-
// Initial first and second level indices.
54-
(0..second.len()).for_each(|_| lookup.push((mixed.len(), outer.len())));
55-
56-
// Leave some extra room, as mixing won't balance evenly.
57-
let mut inner = Vec::with_capacity(100);
58-
inner.extend_from_slice(second);
59-
60-
outer.push(inner);
61-
}
62-
63-
mixed.push(outer);
64-
skip.push(first.len());
46+
input.iter().map(|&n| (n * key).rem_euclid(size as i64) as usize).collect();
47+
// Store location of each number within `mixed` for faster lookup.
48+
let mut lookup = Vec::with_capacity(input.len());
49+
// Size of each block of 16 elements for faster lookup.
50+
let mut skip = [0; 16];
51+
// Break 5000 numbers into roughly equal chunks.
52+
let mut mixed: [_; 256] = from_fn(|_| PaddedVec { size: 0, vec: Vec::with_capacity(128) });
53+
54+
for (second, slice) in indices.chunks(input.len().div_ceil(256)).enumerate() {
55+
let size = slice.len();
56+
57+
mixed[second].size = size;
58+
mixed[second].vec.resize(size.next_multiple_of(64), 0);
59+
mixed[second].vec[..size].copy_from_slice(slice);
60+
61+
lookup.extend(repeat_n(second, size));
62+
skip[second / 16] += size;
6563
}
6664

6765
for _ in 0..rounds {
6866
'mix: for index in 0..input.len() {
6967
// Quickly find the leaf vector storing the number.
7068
let number = numbers[index];
71-
let (first, second) = lookup[index];
69+
let second = lookup[index];
70+
let first = second / 16;
71+
7272
// Third level changes as other numbers are added and removed,
7373
// so needs to be checked each time.
74-
let third = mixed[first][second].iter().position(|&i| i == index).unwrap();
74+
let third = position(&mixed[second], index as u16);
7575

7676
// Find the offset of the number by adding the size of all previous `vec`s.
7777
let position = third
7878
+ skip[..first].iter().sum::<usize>()
79-
+ mixed[first][..second].iter().map(Vec::len).sum::<usize>();
79+
+ mixed[16 * first..second].iter().map(|v| v.size).sum::<usize>();
8080
// Update our position, wrapping around if necessary.
8181
let mut next = (position + number) % size;
8282

8383
// Remove number from current leaf vector, also updating the first level size.
84-
mixed[first][second].remove(third);
84+
mixed[second].size -= 1;
85+
mixed[second].vec.remove(third);
86+
mixed[second].vec.push(0);
8587
skip[first] -= 1;
8688

8789
// Find our new destination, by checking `vec`s in order until the total elements
8890
// are greater than our new index.
89-
for (first, outer) in mixed.iter_mut().enumerate() {
91+
for (first, outer) in mixed.chunks_exact_mut(16).enumerate() {
9092
if next > skip[first] {
9193
next -= skip[first];
9294
} else {
9395
for (second, inner) in outer.iter_mut().enumerate() {
94-
if next > inner.len() {
95-
next -= inner.len();
96+
if next > inner.size {
97+
next -= inner.size;
9698
} else {
9799
// Insert number into its new home.
98-
inner.insert(next, index);
100+
inner.size += 1;
101+
inner.vec.insert(next, index as u16);
102+
inner.vec.resize(inner.size.next_multiple_of(64), 0);
103+
// Update location.
99104
skip[first] += 1;
100-
lookup[index] = (first, second);
105+
lookup[index] = 16 * first + second;
101106
continue 'mix;
102107
}
103108
}
@@ -106,12 +111,44 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
106111
}
107112
}
108113

109-
let indices: Vec<_> = mixed.into_iter().flatten().flatten().collect();
110-
let zeroth = indices.iter().position(|&i| input[i] == 0).unwrap();
114+
let indices: Vec<_> =
115+
mixed.into_iter().flat_map(|pv| pv.vec.into_iter().take(pv.size)).collect();
116+
let zeroth = indices.iter().position(|&i| input[i as usize] == 0).unwrap();
111117

112118
[1000, 2000, 3000]
113119
.iter()
114120
.map(|offset| (zeroth + offset) % indices.len())
115-
.map(|index| input[indices[index]] * key)
121+
.map(|index| input[indices[index] as usize] * key)
116122
.sum()
117123
}
124+
125+
/// The compiler optimizes the position search when the size of the chunk is known.
126+
#[cfg(not(feature = "simd"))]
127+
#[inline]
128+
fn position(haystack: &PaddedVec, needle: u16) -> usize {
129+
for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
130+
if let Some(offset) = slice.iter().position(|&i| i == needle) {
131+
return 64 * base + offset;
132+
}
133+
}
134+
135+
unreachable!()
136+
}
137+
138+
/// Search 64 lanes simultaneously.
139+
#[cfg(feature = "simd")]
140+
#[inline]
141+
fn position(haystack: &PaddedVec, needle: u16) -> usize {
142+
use std::simd::cmp::SimdPartialEq as _;
143+
use std::simd::*;
144+
145+
for (base, slice) in haystack.vec.chunks_exact(64).enumerate() {
146+
if let Some(offset) =
147+
Simd::<u16, 64>::from_slice(slice).simd_eq(Simd::splat(needle)).first_set()
148+
{
149+
return 64 * base + offset;
150+
}
151+
}
152+
153+
unreachable!()
154+
}

0 commit comments

Comments
 (0)