Skip to content

Commit 29f1a5d

Browse files
committed
Speed up MD5 hashing by skipping gather operation where possible
1 parent 8c933a5 commit 29f1a5d

File tree

6 files changed: 88 additions and 110 deletions

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Improvements to solutions are always appreciated. Please see the
5858
## Performance
5959

6060
Benchmarks are measured using the built-in `cargo bench` tool run on an [Apple M2 Max][apple-link].
61-
All 250 solutions from 2024 to 2015 complete sequentially in **510 milliseconds**.
61+
All 250 solutions from 2024 to 2015 complete sequentially in **504 milliseconds**.
6262
Interestingly 86% of the total time is spent on just 9 solutions.
6363
Performance is reasonable even on older hardware, for example a 2011 MacBook Pro with an
6464
[Intel i7-2720QM][intel-link] processor takes 3.5 seconds to run the same 250 solutions.
@@ -67,7 +67,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
6767

6868
| Year | [2015](#2015) | [2016](#2016) | [2017](#2017) | [2018](#2018) | [2019](#2019) | [2020](#2020) | [2021](#2021) | [2022](#2022) | [2023](#2023) | [2024](#2024) |
6969
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70-
| Benchmark (ms) | 17 | 117 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
70+
| Benchmark (ms) | 17 | 111 | 82 | 35 | 15 | 220 | 8 | 7 | 5 | 4 |
7171

7272
## 2024
7373

@@ -335,7 +335,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
335335
| 2 | [Bathroom Security](https://adventofcode.com/2016/day/2) | [Source](src/year2016/day02.rs) | 29 |
336336
| 3 | [Squares With Three Sides](https://adventofcode.com/2016/day/3) | [Source](src/year2016/day03.rs) | 24 |
337337
| 4 | [Security Through Obscurity](https://adventofcode.com/2016/day/4) | [Source](src/year2016/day04.rs) | 79 |
338-
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 35000 |
338+
| 5 | [How About a Nice Game of Chess?](https://adventofcode.com/2016/day/5) | [Source](src/year2016/day05.rs) | 34000 |
339339
| 6 | [Signals and Noise](https://adventofcode.com/2016/day/6) | [Source](src/year2016/day06.rs) | 3 |
340340
| 7 | [Internet Protocol Version 7](https://adventofcode.com/2016/day/7) | [Source](src/year2016/day07.rs) | 364 |
341341
| 8 | [Two-Factor Authentication](https://adventofcode.com/2016/day/8) | [Source](src/year2016/day08.rs) | 9 |
@@ -344,7 +344,7 @@ Performance is reasonable even on older hardware, for example a 2011 MacBook Pro
344344
| 11 | [Radioisotope Thermoelectric Generators](https://adventofcode.com/2016/day/11) | [Source](src/year2016/day11.rs) | 719 |
345345
| 12 | [Leonardo's Monorail](https://adventofcode.com/2016/day/12) | [Source](src/year2016/day12.rs) | 1 |
346346
| 13 | [A Maze of Twisty Little Cubicles](https://adventofcode.com/2016/day/13) | [Source](src/year2016/day13.rs) | 3 |
347-
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 77000 |
347+
| 14 | [One-Time Pad](https://adventofcode.com/2016/day/14) | [Source](src/year2016/day14.rs) | 72000 |
348348
| 15 | [Timing is Everything](https://adventofcode.com/2016/day/15) | [Source](src/year2016/day15.rs) | 1 |
349349
| 16 | [Dragon Checksum](https://adventofcode.com/2016/day/16) | [Source](src/year2016/day16.rs) | 1 |
350350
| 17 | [Two Steps Forward](https://adventofcode.com/2016/day/17) | [Source](src/year2016/day17.rs) | 3858 |

src/util/md5.rs

Lines changed: 49 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,27 @@
1111
//! [`#[inline]`](https://doc.rust-lang.org/reference/attributes/codegen.html#the-inline-attribute).
1212
//!
1313
//! An optional SIMD variant that computes multiple hashes in parallel is also implemented.
14-
1514
pub fn buffer_size(n: usize) -> usize {
1615
(n + 9).next_multiple_of(64)
1716
}
1817

19-
pub fn hash(mut buffer: &mut [u8], size: usize) -> (u32, u32, u32, u32) {
18+
#[inline]
19+
pub fn hash(buffer: &mut [u8], size: usize) -> [u32; 4] {
2020
let end = buffer.len() - 8;
2121
let bits = size * 8;
2222

2323
buffer[size] = 0x80;
2424
buffer[end..].copy_from_slice(&bits.to_le_bytes());
2525

2626
let mut m = [0; 16];
27-
let mut a0: u32 = 0x67452301;
28-
let mut b0: u32 = 0xefcdab89;
29-
let mut c0: u32 = 0x98badcfe;
30-
let mut d0: u32 = 0x10325476;
27+
let [mut a0, mut b0, mut c0, mut d0] = [0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476];
3128

32-
while !buffer.is_empty() {
33-
let (prefix, suffix) = buffer.split_at_mut(64);
34-
buffer = suffix;
35-
36-
for (i, chunk) in prefix.chunks_exact(4).enumerate() {
29+
for block in buffer.chunks_exact(64) {
30+
for (i, chunk) in block.chunks_exact(4).enumerate() {
3731
m[i] = u32::from_le_bytes(chunk.try_into().unwrap());
3832
}
3933

40-
let mut a = a0;
41-
let mut b = b0;
42-
let mut c = c0;
43-
let mut d = d0;
34+
let [mut a, mut b, mut c, mut d] = [a0, b0, c0, d0];
4435

4536
a = round1(a, b, c, d, m[0], 7, 0xd76aa478);
4637
d = round1(d, a, b, c, m[1], 12, 0xe8c7b756);
@@ -110,13 +101,11 @@ pub fn hash(mut buffer: &mut [u8], size: usize) -> (u32, u32, u32, u32) {
110101
c = round4(c, d, a, b, m[2], 15, 0x2ad7d2bb);
111102
b = round4(b, c, d, a, m[9], 21, 0xeb86d391);
112103

113-
a0 = a0.wrapping_add(a);
114-
b0 = b0.wrapping_add(b);
115-
c0 = c0.wrapping_add(c);
116-
d0 = d0.wrapping_add(d);
104+
[a0, b0, c0, d0] =
105+
[a0.wrapping_add(a), b0.wrapping_add(b), c0.wrapping_add(c), d0.wrapping_add(d)];
117106
}
118107

119-
(a0.to_be(), b0.to_be(), c0.to_be(), d0.to_be())
108+
[a0.to_be(), b0.to_be(), c0.to_be(), d0.to_be()]
120109
}
121110

122111
#[inline]
@@ -150,69 +139,59 @@ fn common(f: u32, a: u32, b: u32, m: u32, s: u32, k: u32) -> u32 {
150139

151140
#[cfg(feature = "simd")]
152141
pub mod simd {
153-
use std::array;
142+
use std::array::from_fn;
154143
use std::simd::num::SimdUint as _;
155144
use std::simd::{LaneCount, Simd, SupportedLaneCount};
156145

157146
#[inline]
158-
#[expect(clippy::too_many_lines)]
159-
pub fn hash<const N: usize>(
160-
buffers: &mut [[u8; 64]],
161-
size: usize,
162-
) -> ([u32; N], [u32; N], [u32; N], [u32; N])
147+
pub fn hash_fixed<const N: usize>(buffers: &mut [[u8; 64]; N], size: usize) -> [[u32; N]; 4]
163148
where
164149
LaneCount<N>: SupportedLaneCount,
165150
{
166151
// Assume all buffers are the same size.
167-
let end = 64 - 8;
168-
let bits = size * 8;
169-
170152
for buffer in buffers.iter_mut() {
171153
buffer[size] = 0x80;
172-
buffer[end..].copy_from_slice(&bits.to_le_bytes());
173154
}
174155

175-
let mut a0: Simd<u32, N> = Simd::splat(0x67452301);
176-
let mut b0: Simd<u32, N> = Simd::splat(0xefcdab89);
177-
let mut c0: Simd<u32, N> = Simd::splat(0x98badcfe);
178-
let mut d0: Simd<u32, N> = Simd::splat(0x10325476);
156+
let [a0, b0, c0, d0] = [
157+
Simd::splat(0x67452301),
158+
Simd::splat(0xefcdab89),
159+
Simd::splat(0x98badcfe),
160+
Simd::splat(0x10325476),
161+
];
162+
let [mut a, mut b, mut c, mut d] = [a0, b0, c0, d0];
179163

180-
let mut a = a0;
181-
let mut b = b0;
182-
let mut c = c0;
183-
let mut d = d0;
184-
185-
let m0 = message(buffers, 0);
164+
let m0 = message(buffers, 0, size);
186165
a = round1(a, b, c, d, m0, 7, 0xd76aa478);
187-
let m1 = message(buffers, 1);
166+
let m1 = message(buffers, 4, size);
188167
d = round1(d, a, b, c, m1, 12, 0xe8c7b756);
189-
let m2 = message(buffers, 2);
168+
let m2 = message(buffers, 8, size);
190169
c = round1(c, d, a, b, m2, 17, 0x242070db);
191-
let m3 = message(buffers, 3);
170+
let m3 = message(buffers, 12, size);
192171
b = round1(b, c, d, a, m3, 22, 0xc1bdceee);
193-
let m4 = message(buffers, 4);
172+
let m4 = message(buffers, 16, size);
194173
a = round1(a, b, c, d, m4, 7, 0xf57c0faf);
195-
let m5 = message(buffers, 5);
174+
let m5 = message(buffers, 20, size);
196175
d = round1(d, a, b, c, m5, 12, 0x4787c62a);
197-
let m6 = message(buffers, 6);
176+
let m6 = message(buffers, 24, size);
198177
c = round1(c, d, a, b, m6, 17, 0xa8304613);
199-
let m7 = message(buffers, 7);
178+
let m7 = message(buffers, 28, size);
200179
b = round1(b, c, d, a, m7, 22, 0xfd469501);
201-
let m8 = message(buffers, 8);
180+
let m8 = message(buffers, 32, size);
202181
a = round1(a, b, c, d, m8, 7, 0x698098d8);
203-
let m9 = message(buffers, 9);
182+
let m9 = message(buffers, 36, size);
204183
d = round1(d, a, b, c, m9, 12, 0x8b44f7af);
205-
let m10 = message(buffers, 10);
184+
let m10 = message(buffers, 40, size);
206185
c = round1(c, d, a, b, m10, 17, 0xffff5bb1);
207-
let m11 = message(buffers, 11);
186+
let m11 = message(buffers, 44, size);
208187
b = round1(b, c, d, a, m11, 22, 0x895cd7be);
209-
let m12 = message(buffers, 12);
188+
let m12 = message(buffers, 48, size);
210189
a = round1(a, b, c, d, m12, 7, 0x6b901122);
211-
let m13 = message(buffers, 13);
190+
let m13 = message(buffers, 52, size);
212191
d = round1(d, a, b, c, m13, 12, 0xfd987193);
213-
let m14 = message(buffers, 14);
192+
let m14 = Simd::splat(size as u32 * 8);
214193
c = round1(c, d, a, b, m14, 17, 0xa679438e);
215-
let m15 = message(buffers, 15);
194+
let m15 = Simd::splat(0);
216195
b = round1(b, c, d, a, m15, 22, 0x49b40821);
217196

218197
a = round2(a, b, c, d, m1, 5, 0xf61e2562);
@@ -266,30 +245,27 @@ pub mod simd {
266245
c = round4(c, d, a, b, m2, 15, 0x2ad7d2bb);
267246
b = round4(b, c, d, a, m9, 21, 0xeb86d391);
268247

269-
a0 += a;
270-
b0 += b;
271-
c0 += c;
272-
d0 += d;
273-
274-
(
275-
a0.swap_bytes().to_array(),
276-
b0.swap_bytes().to_array(),
277-
c0.swap_bytes().to_array(),
278-
d0.swap_bytes().to_array(),
279-
)
248+
[
249+
(a0 + a).swap_bytes().to_array(),
250+
(b0 + b).swap_bytes().to_array(),
251+
(c0 + c).swap_bytes().to_array(),
252+
(d0 + d).swap_bytes().to_array(),
253+
]
280254
}
281255

282256
#[inline]
283-
fn message<const N: usize>(buffers: &mut [[u8; 64]], i: usize) -> Simd<u32, N>
257+
fn message<const N: usize>(buffers: &[[u8; 64]; N], i: usize, size: usize) -> Simd<u32, N>
284258
where
285259
LaneCount<N>: SupportedLaneCount,
286260
{
287-
let start = 4 * i;
288-
let end = start + 4;
289-
Simd::from_array(array::from_fn(|lane| {
290-
let slice = &buffers[lane][start..end];
291-
u32::from_le_bytes(slice.try_into().unwrap())
292-
}))
261+
if i > size {
262+
Simd::splat(0)
263+
} else {
264+
Simd::from_array(from_fn(|lane| {
265+
let slice = &buffers[lane][i..i + 4];
266+
u32::from_le_bytes(slice.try_into().unwrap())
267+
}))
268+
}
293269
}
294270

295271
#[inline]

src/year2015/day04.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ fn format_string(prefix: &str, n: u32) -> ([u8; 64], usize) {
7272
}
7373

7474
fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
75-
let (result, ..) = hash(buffer, size);
75+
let [result, ..] = hash(buffer, size);
7676

7777
if result & 0xffffff00 == 0 {
7878
shared.second.fetch_min(n, Ordering::Relaxed);
@@ -101,12 +101,12 @@ fn worker(shared: &Shared) {
101101
#[cfg(feature = "simd")]
102102
mod simd {
103103
use super::*;
104-
use crate::util::md5::simd::hash;
104+
use crate::util::md5::simd::hash_fixed;
105105
use std::simd::{LaneCount, SupportedLaneCount};
106106

107107
#[expect(clippy::needless_range_loop)]
108108
fn check_hash_simd<const N: usize>(
109-
buffers: &mut [[u8; 64]],
109+
buffers: &mut [[u8; 64]; N],
110110
size: usize,
111111
start: u32,
112112
offset: u32,
@@ -122,7 +122,7 @@ mod simd {
122122
buffers[i][size - 1] = b'0' + (n % 10) as u8;
123123
}
124124

125-
let (result, ..) = hash::<N>(buffers, size);
125+
let [result, ..] = hash_fixed(buffers, size);
126126

127127
for i in 0..N {
128128
if result[i] & 0xffffff00 == 0 {
@@ -137,13 +137,14 @@ mod simd {
137137
pub(super) fn worker(shared: &Shared) {
138138
while let Some(start) = shared.iter.next() {
139139
let (prefix, size) = format_string(&shared.prefix, start);
140-
let mut buffers = [prefix; 32];
140+
let buffers = &mut [prefix; 32];
141141

142142
for offset in (0..992).step_by(32) {
143-
check_hash_simd::<32>(&mut buffers, size, start, offset, shared);
143+
check_hash_simd(buffers, size, start, offset, shared);
144144
}
145145

146-
check_hash_simd::<8>(&mut buffers, size, start, 992, shared);
146+
let buffers = &mut [prefix; 8];
147+
check_hash_simd(buffers, size, start, 992, shared);
147148
}
148149
}
149150
}

src/year2016/day05.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ fn format_string(prefix: &str, n: u32) -> ([u8; 64], usize) {
7878
}
7979

8080
fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
81-
let (result, ..) = hash(buffer, size);
81+
let [result, ..] = hash(buffer, size);
8282

8383
if result & 0xfffff000 == 0 {
8484
let mut exclusive = shared.mutex.lock().unwrap();
@@ -111,12 +111,12 @@ fn worker(shared: &Shared) {
111111
#[cfg(feature = "simd")]
112112
mod simd {
113113
use super::*;
114-
use crate::util::md5::simd::hash;
114+
use crate::util::md5::simd::hash_fixed;
115115
use std::simd::{LaneCount, SupportedLaneCount};
116116

117117
#[expect(clippy::needless_range_loop)]
118118
fn check_hash_simd<const N: usize>(
119-
buffers: &mut [[u8; 64]],
119+
buffers: &mut [[u8; 64]; N],
120120
size: usize,
121121
start: u32,
122122
offset: u32,
@@ -132,7 +132,7 @@ mod simd {
132132
buffers[i][size - 1] = b'0' + (n % 10) as u8;
133133
}
134134

135-
let (result, ..) = hash::<N>(buffers, size);
135+
let [result, ..] = hash_fixed(buffers, size);
136136

137137
for i in 0..N {
138138
if result[i] & 0xfffff000 == 0 {
@@ -151,13 +151,14 @@ mod simd {
151151
pub(super) fn worker(shared: &Shared) {
152152
while let Some(start) = shared.iter.next() {
153153
let (prefix, size) = format_string(&shared.prefix, start);
154-
let mut buffers = [prefix; 32];
154+
let buffers = &mut [prefix; 32];
155155

156156
for offset in (0..992).step_by(32) {
157-
check_hash_simd::<32>(&mut buffers, size, start, offset, shared);
157+
check_hash_simd(buffers, size, start, offset, shared);
158158
}
159159

160-
check_hash_simd::<8>(&mut buffers, size, start, 992, shared);
160+
let buffers = &mut [prefix; 8];
161+
check_hash_simd(buffers, size, start, 992, shared);
161162
}
162163
}
163164
}

0 commit comments

Comments
 (0)