Skip to content

Commit 2f43d21

Browse files
committed
Speed up MD5 hashing by skipping gather operation where possible
1 parent 8c933a5 commit 2f43d21

File tree

6 files changed

+76
-83
lines changed

6 files changed

+76
-83
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Improvements to solutions are always appreciated. Please see the
5858
## Performance
5959

6060
Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
61-
All 250 solutions from 2024 to 2015 complete sequentially in **510 milliseconds**.
61+
All 250 solutions from 2024 to 2015 complete sequentially in **504 milliseconds**.
6262
Interestingly, 86% of the total time is spent on just 9 solutions.
6363
Performance is reasonable even on older hardware; for example, a 2011 MacBook Pro with an
6464
[Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 250 solutions.
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
6767

6868
| Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
6969
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70-
| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
70+
| Benchmark (ms) | 17 | 111 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
7171

7272
## 2024
7373

@@ -335,7 +335,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
335335
| 2 | [Bathroom Security](https://adventofcode.com/2016/day/2) | [Source](src/year2016/day02.rs) | 29 |
336336
| 3 | [Squares With Three Sides](https://adventofcode.com/2016/day/3) | [Source](src/year2016/day03.rs) | 24 |
337337
| 4 | [Security Through Obscurity](https://adventofcode.com/2016/day/4) | [Source](src/year2016/day04.rs) | 79 |
338-
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 35000 |
338+
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 34000 |
339339
| 6 | [Signals and Noise](https://adventofcode.com/2016/day/6) | [Source](src/year2016/day06.rs) | 3 |
340340
| 7 | [Internet Protocol Version 7](https://adventofcode.com/2016/day/7) | [Source](src/year2016/day07.rs) | 364 |
341341
| 8 | [Two-Factor Authentication](https://adventofcode.com/2016/day/8) | [Source](src/year2016/day08.rs) | 9 |
@@ -344,7 +344,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
344344
| 11 | [Radioisotope Thermoelectric Generators](https://adventofcode.com/2016/day/11) | [Source](src/year2016/day11.rs) | 719 |
345345
| 12 | [Leonardo's Monorail](https://adventofcode.com/2016/day/12) | [Source](src/year2016/day12.rs) | 1 |
346346
| 13 | [A Maze of Twisty Little Cubicles](https://adventofcode.com/2016/day/13) | [Source](src/year2016/day13.rs) | 3 |
347-
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 77000 |
347+
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 72000 |
348348
| 15 | [Timing is Everything](https://adventofcode.com/2016/day/15) | [Source](src/year2016/day15.rs) | 1 |
349349
| 16 | [Dragon Checksum](https://adventofcode.com/2016/day/16) | [Source](src/year2016/day16.rs) | 1 |
350350
| 17 | [Two Steps Forward](https://adventofcode.com/2016/day/17) | [Source](src/year2016/day17.rs) | 3858 |

src/util/md5.rs

Lines changed: 37 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,11 @@
1111
//! [`#[inline]`](https://doc.rust-lang.org/reference/attributes/codegen.html#the-inline-attribute).
1212
//!
1313
//! An optional SIMD variant that computes multiple hashes in parallel is also implemented.
14-
1514
pub fn buffer_size(n: usize) -> usize {
1615
(n + 9).next_multiple_of(64)
1716
}
1817

19-
pub fn hash(mut buffer: &mut [u8], size: usize) -> (u32, u32, u32, u32) {
18+
pub fn hash(buffer: &mut [u8], size: usize) -> [u32; 4] {
2019
let end = buffer.len() - 8;
2120
let bits = size * 8;
2221

@@ -29,11 +28,8 @@ pub fn hash(mut buffer: &mut [u8], size: usize) -> (u32, u32, u32, u32) {
2928
let mut c0: u32 = 0x98badcfe;
3029
let mut d0: u32 = 0x10325476;
3130

32-
while !buffer.is_empty() {
33-
let (prefix, suffix) = buffer.split_at_mut(64);
34-
buffer = suffix;
35-
36-
for (i, chunk) in prefix.chunks_exact(4).enumerate() {
31+
for block in buffer.chunks_exact(64) {
32+
for (i, chunk) in block.chunks_exact(4).enumerate() {
3733
m[i] = u32::from_le_bytes(chunk.try_into().unwrap());
3834
}
3935

@@ -116,7 +112,7 @@ pub fn hash(mut buffer: &mut [u8], size: usize) -> (u32, u32, u32, u32) {
116112
d0 = d0.wrapping_add(d);
117113
}
118114

119-
(a0.to_be(), b0.to_be(), c0.to_be(), d0.to_be())
115+
[a0.to_be(), b0.to_be(), c0.to_be(), d0.to_be()]
120116
}
121117

122118
#[inline]
@@ -150,69 +146,62 @@ fn common(f: u32, a: u32, b: u32, m: u32, s: u32, k: u32) -> u32 {
150146

151147
#[cfg(feature = "simd")]
152148
pub mod simd {
153-
use std::array;
149+
use std::array::from_fn;
154150
use std::simd::num::SimdUint as _;
155151
use std::simd::{LaneCount, Simd, SupportedLaneCount};
156152

157153
#[inline]
158154
#[expect(clippy::too_many_lines)]
159-
pub fn hash<const N: usize>(
160-
buffers: &mut [[u8; 64]],
161-
size: usize,
162-
) -> ([u32; N], [u32; N], [u32; N], [u32; N])
155+
pub fn hash_fixed<const N: usize>(buffers: &mut [[u8; 64]; N], size: usize) -> [[u32; N]; 4]
163156
where
164157
LaneCount<N>: SupportedLaneCount,
165158
{
166159
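// Assume every buffer holds a message of the same length `size`, which must be less than 56 bytes so that the padding and length fit in a single block.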
167-
let end = 64 - 8;
168-
let bits = size * 8;
169-
170160
for buffer in buffers.iter_mut() {
171161
buffer[size] = 0x80;
172-
buffer[end..].copy_from_slice(&bits.to_le_bytes());
173162
}
174163

175-
let mut a0: Simd<u32, N> = Simd::splat(0x67452301);
176-
let mut b0: Simd<u32, N> = Simd::splat(0xefcdab89);
177-
let mut c0: Simd<u32, N> = Simd::splat(0x98badcfe);
178-
let mut d0: Simd<u32, N> = Simd::splat(0x10325476);
164+
let mut a0 = Simd::splat(0x67452301);
165+
let mut b0 = Simd::splat(0xefcdab89);
166+
let mut c0 = Simd::splat(0x98badcfe);
167+
let mut d0 = Simd::splat(0x10325476);
179168

180169
let mut a = a0;
181170
let mut b = b0;
182171
let mut c = c0;
183172
let mut d = d0;
184173

185-
let m0 = message(buffers, 0);
174+
let m0 = message(buffers, 0, size);
186175
a = round1(a, b, c, d, m0, 7, 0xd76aa478);
187-
let m1 = message(buffers, 1);
176+
let m1 = message(buffers, 4, size);
188177
d = round1(d, a, b, c, m1, 12, 0xe8c7b756);
189-
let m2 = message(buffers, 2);
178+
let m2 = message(buffers, 8, size);
190179
c = round1(c, d, a, b, m2, 17, 0x242070db);
191-
let m3 = message(buffers, 3);
180+
let m3 = message(buffers, 12, size);
192181
b = round1(b, c, d, a, m3, 22, 0xc1bdceee);
193-
let m4 = message(buffers, 4);
182+
let m4 = message(buffers, 16, size);
194183
a = round1(a, b, c, d, m4, 7, 0xf57c0faf);
195-
let m5 = message(buffers, 5);
184+
let m5 = message(buffers, 20, size);
196185
d = round1(d, a, b, c, m5, 12, 0x4787c62a);
197-
let m6 = message(buffers, 6);
186+
let m6 = message(buffers, 24, size);
198187
c = round1(c, d, a, b, m6, 17, 0xa8304613);
199-
let m7 = message(buffers, 7);
188+
let m7 = message(buffers, 28, size);
200189
b = round1(b, c, d, a, m7, 22, 0xfd469501);
201-
let m8 = message(buffers, 8);
190+
let m8 = message(buffers, 32, size);
202191
a = round1(a, b, c, d, m8, 7, 0x698098d8);
203-
let m9 = message(buffers, 9);
192+
let m9 = message(buffers, 36, size);
204193
d = round1(d, a, b, c, m9, 12, 0x8b44f7af);
205-
let m10 = message(buffers, 10);
194+
let m10 = message(buffers, 40, size);
206195
c = round1(c, d, a, b, m10, 17, 0xffff5bb1);
207-
let m11 = message(buffers, 11);
196+
let m11 = message(buffers, 44, size);
208197
b = round1(b, c, d, a, m11, 22, 0x895cd7be);
209-
let m12 = message(buffers, 12);
198+
let m12 = message(buffers, 48, size);
210199
a = round1(a, b, c, d, m12, 7, 0x6b901122);
211-
let m13 = message(buffers, 13);
200+
let m13 = message(buffers, 52, size);
212201
d = round1(d, a, b, c, m13, 12, 0xfd987193);
213-
let m14 = message(buffers, 14);
202+
let m14 = Simd::splat(size as u32 * 8);
214203
c = round1(c, d, a, b, m14, 17, 0xa679438e);
215-
let m15 = message(buffers, 15);
204+
let m15 = Simd::splat(0);
216205
b = round1(b, c, d, a, m15, 22, 0x49b40821);
217206

218207
a = round2(a, b, c, d, m1, 5, 0xf61e2562);
@@ -271,25 +260,27 @@ pub mod simd {
271260
c0 += c;
272261
d0 += d;
273262

274-
(
263+
[
275264
a0.swap_bytes().to_array(),
276265
b0.swap_bytes().to_array(),
277266
c0.swap_bytes().to_array(),
278267
d0.swap_bytes().to_array(),
279-
)
268+
]
280269
}
281270

282271
#[inline]
283-
fn message<const N: usize>(buffers: &mut [[u8; 64]], i: usize) -> Simd<u32, N>
272+
fn message<const N: usize>(buffers: &[[u8; 64]; N], i: usize, size: usize) -> Simd<u32, N>
284273
where
285274
LaneCount<N>: SupportedLaneCount,
286275
{
287-
let start = 4 * i;
288-
let end = start + 4;
289-
Simd::from_array(array::from_fn(|lane| {
290-
let slice = &buffers[lane][start..end];
291-
u32::from_le_bytes(slice.try_into().unwrap())
292-
}))
276+
if i > size {
277+
Simd::splat(0)
278+
} else {
279+
Simd::from_array(from_fn(|lane| {
280+
let slice = &buffers[lane][i..i + 4];
281+
u32::from_le_bytes(slice.try_into().unwrap())
282+
}))
283+
}
293284
}
294285

295286
#[inline]

src/year2015/day04.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ fn format_string(prefix: &str, n: u32) -> ([u8; 64], usize) {
7272
}
7373

7474
fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
75-
let (result, ..) = hash(buffer, size);
75+
let [result, ..] = hash(buffer, size);
7676

7777
if result & 0xffffff00 == 0 {
7878
shared.second.fetch_min(n, Ordering::Relaxed);
@@ -101,12 +101,12 @@ fn worker(shared: &Shared) {
101101
#[cfg(feature = "simd")]
102102
mod simd {
103103
use super::*;
104-
use crate::util::md5::simd::hash;
104+
use crate::util::md5::simd::hash_fixed;
105105
use std::simd::{LaneCount, SupportedLaneCount};
106106

107107
#[expect(clippy::needless_range_loop)]
108108
fn check_hash_simd<const N: usize>(
109-
buffers: &mut [[u8; 64]],
109+
buffers: &mut [[u8; 64]; N],
110110
size: usize,
111111
start: u32,
112112
offset: u32,
@@ -122,7 +122,7 @@ mod simd {
122122
buffers[i][size - 1] = b'0' + (n % 10) as u8;
123123
}
124124

125-
let (result, ..) = hash::<N>(buffers, size);
125+
let [result, ..] = hash_fixed(buffers, size);
126126

127127
for i in 0..N {
128128
if result[i] & 0xffffff00 == 0 {
@@ -137,13 +137,14 @@ mod simd {
137137
pub(super) fn worker(shared: &Shared) {
138138
while let Some(start) = shared.iter.next() {
139139
let (prefix, size) = format_string(&shared.prefix, start);
140-
let mut buffers = [prefix; 32];
140+
let buffers = &mut [prefix; 32];
141141

142142
for offset in (0..992).step_by(32) {
143-
check_hash_simd::<32>(&mut buffers, size, start, offset, shared);
143+
check_hash_simd(buffers, size, start, offset, shared);
144144
}
145145

146-
check_hash_simd::<8>(&mut buffers, size, start, 992, shared);
146+
let buffers = &mut [prefix; 8];
147+
check_hash_simd(buffers, size, start, 992, shared);
147148
}
148149
}
149150
}

src/year2016/day05.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ fn format_string(prefix: &str, n: u32) -> ([u8; 64], usize) {
7878
}
7979

8080
fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
81-
let (result, ..) = hash(buffer, size);
81+
let [result, ..] = hash(buffer, size);
8282

8383
if result & 0xfffff000 == 0 {
8484
let mut exclusive = shared.mutex.lock().unwrap();
@@ -111,12 +111,12 @@ fn worker(shared: &Shared) {
111111
#[cfg(feature = "simd")]
112112
mod simd {
113113
use super::*;
114-
use crate::util::md5::simd::hash;
114+
use crate::util::md5::simd::hash_fixed;
115115
use std::simd::{LaneCount, SupportedLaneCount};
116116

117117
#[expect(clippy::needless_range_loop)]
118118
fn check_hash_simd<const N: usize>(
119-
buffers: &mut [[u8; 64]],
119+
buffers: &mut [[u8; 64]; N],
120120
size: usize,
121121
start: u32,
122122
offset: u32,
@@ -132,7 +132,7 @@ mod simd {
132132
buffers[i][size - 1] = b'0' + (n % 10) as u8;
133133
}
134134

135-
let (result, ..) = hash::<N>(buffers, size);
135+
let [result, ..] = hash_fixed(buffers, size);
136136

137137
for i in 0..N {
138138
if result[i] & 0xfffff000 == 0 {
@@ -151,13 +151,14 @@ mod simd {
151151
pub(super) fn worker(shared: &Shared) {
152152
while let Some(start) = shared.iter.next() {
153153
let (prefix, size) = format_string(&shared.prefix, start);
154-
let mut buffers = [prefix; 32];
154+
let buffers = &mut [prefix; 32];
155155

156156
for offset in (0..992).step_by(32) {
157-
check_hash_simd::<32>(&mut buffers, size, start, offset, shared);
157+
check_hash_simd(buffers, size, start, offset, shared);
158158
}
159159

160-
check_hash_simd::<8>(&mut buffers, size, start, 992, shared);
160+
let buffers = &mut [prefix; 8];
161+
check_hash_simd(buffers, size, start, 992, shared);
161162
}
162163
}
163164
}

src/year2016/day14.rs

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ fn worker(shared: &Shared<'_>) {
6464

6565
if shared.part_two {
6666
for _ in 0..2016 {
67-
buffer[0..8].copy_from_slice(&to_ascii(result.0));
68-
buffer[8..16].copy_from_slice(&to_ascii(result.1));
69-
buffer[16..24].copy_from_slice(&to_ascii(result.2));
70-
buffer[24..32].copy_from_slice(&to_ascii(result.3));
67+
buffer[0..8].copy_from_slice(&to_ascii(result[0]));
68+
buffer[8..16].copy_from_slice(&to_ascii(result[1]));
69+
buffer[16..24].copy_from_slice(&to_ascii(result[2]));
70+
buffer[24..32].copy_from_slice(&to_ascii(result[3]));
7171
result = hash(&mut buffer, 32);
7272
}
7373
}
@@ -80,7 +80,7 @@ fn worker(shared: &Shared<'_>) {
8080
#[cfg(feature = "simd")]
8181
#[expect(clippy::needless_range_loop)]
8282
fn worker(shared: &Shared<'_>) {
83-
let mut result = ([0; 32], [0; 32], [0; 32], [0; 32]);
83+
let mut result = [[0; 32]; 4];
8484
let mut buffers = [[0; 64]; 32];
8585

8686
while let Some(start) = shared.iter.next() {
@@ -90,36 +90,36 @@ fn worker(shared: &Shared<'_>) {
9090
// Calculate the hash.
9191
for i in 0..32 {
9292
let (mut buffer, size) = format_string(shared.input, start + i as i32);
93-
let (a, b, c, d) = hash(&mut buffer, size);
93+
let [a, b, c, d] = hash(&mut buffer, size);
9494

95-
result.0[i] = a;
96-
result.1[i] = b;
97-
result.2[i] = c;
98-
result.3[i] = d;
95+
result[0][i] = a;
96+
result[1][i] = b;
97+
result[2][i] = c;
98+
result[3][i] = d;
9999
}
100100

101101
if shared.part_two {
102102
for _ in 0..2016 {
103103
for i in 0..32 {
104-
buffers[i][0..8].copy_from_slice(&to_ascii(result.0[i]));
105-
buffers[i][8..16].copy_from_slice(&to_ascii(result.1[i]));
106-
buffers[i][16..24].copy_from_slice(&to_ascii(result.2[i]));
107-
buffers[i][24..32].copy_from_slice(&to_ascii(result.3[i]));
104+
buffers[i][0..8].copy_from_slice(&to_ascii(result[0][i]));
105+
buffers[i][8..16].copy_from_slice(&to_ascii(result[1][i]));
106+
buffers[i][16..24].copy_from_slice(&to_ascii(result[2][i]));
107+
buffers[i][24..32].copy_from_slice(&to_ascii(result[3][i]));
108108
}
109-
result = simd::hash::<32>(&mut buffers, 32);
109+
result = simd::hash_fixed(&mut buffers, 32);
110110
}
111111
}
112112

113113
for i in 0..32 {
114-
let hash = (result.0[i], result.1[i], result.2[i], result.3[i]);
114+
let hash = [result[0][i], result[1][i], result[2][i], result[3][i]];
115115
check(shared, start + i as i32, hash);
116116
}
117117
}
118118
}
119119

120120
/// Check for sequences of 3 or 5 consecutive matching digits.
121-
fn check(shared: &Shared<'_>, n: i32, hash: (u32, u32, u32, u32)) {
122-
let (a, b, c, d) = hash;
121+
fn check(shared: &Shared<'_>, n: i32, hash: [u32; 4]) {
122+
let [a, b, c, d] = hash;
123123

124124
let mut prev = u32::MAX;
125125
let mut same = 1;

src/year2016/day17.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ fn explore(shared: &Shared, local: &mut State) {
127127
local.max = local.max.max(adjusted);
128128
} else {
129129
// Explore other paths.
130-
let (result, ..) = hash(&mut path, size);
130+
let [result, ..] = hash(&mut path, size);
131131

132132
if y > 0 && ((result >> 28) & 0xf) > 0xa {
133133
local.todo.push((x, y - 1, size + 1, extend(&path, size, b'U')));

0 commit comments

Comments
 (0)