1515//! although perhaps adding [balancing rotations](https://en.wikipedia.org/wiki/Tree_rotation)
1616//! to the tree would make it faster.
1717use crate :: util:: parse:: * ;
18+ use std:: array:: from_fn;
19+ use std:: iter:: repeat_n;
20+
21+ struct PaddedVec {
22+ size : usize ,
23+ vec : Vec < u16 > ,
24+ }
1825
1926pub fn parse ( input : & str ) -> Vec < i64 > {
2027 input. iter_signed ( ) . collect ( )
@@ -32,72 +39,70 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
3239 // Important nuance, size is one less because we don't consider the moving number.
3340 let size = input. len ( ) - 1 ;
3441 // Another nuance, the input contains duplicate numbers, so use index to refer to each number uniquely.
35- let indices: Vec < _ > = ( 0 ..input. len ( ) ) . collect ( ) ;
42+ let indices: Vec < _ > = ( 0 ..input. len ( ) as u16 ) . collect ( ) ;
3643 // Pre-process the numbers, converting any negative indices to positive indices that will wrap.
3744 // For example, -1 becomes 4998.
3845 let numbers: Vec < _ > =
39- input. iter ( ) . map ( |n| ( n * key) . rem_euclid ( size as i64 ) as usize ) . collect ( ) ;
40-
41- // Store first and second level indices.
42- let mut lookup = Vec :: new ( ) ;
43- // Triple nested vec of numbers.
44- let mut mixed = Vec :: new ( ) ;
45- // Size of each first level element for convenience.
46- let mut skip = Vec :: new ( ) ;
47-
48- // Break 5000 numbers into roughly equals chunks at each level. 289 = 17 * 17.
49- for first in indices. chunks ( 289 ) {
50- let mut outer = Vec :: new ( ) ;
51-
52- for second in first. chunks ( 17 ) {
53- // Initial first and second level indices.
54- ( 0 ..second. len ( ) ) . for_each ( |_| lookup. push ( ( mixed. len ( ) , outer. len ( ) ) ) ) ;
55-
56- // Leave some extra room, as mixing won't balance evenly.
57- let mut inner = Vec :: with_capacity ( 100 ) ;
58- inner. extend_from_slice ( second) ;
59-
60- outer. push ( inner) ;
61- }
62-
63- mixed. push ( outer) ;
64- skip. push ( first. len ( ) ) ;
46+ input. iter ( ) . map ( |& n| ( n * key) . rem_euclid ( size as i64 ) as usize ) . collect ( ) ;
47+ // Store location of each number within `mixed` for faster lookup.
48+ let mut lookup = Vec :: with_capacity ( input. len ( ) ) ;
49+ // Size of each block of 16 elements for faster lookup.
50+ let mut skip = [ 0 ; 16 ] ;
51+ // Break 5000 numbers into roughly equal chunks.
52+ let mut mixed: [ _ ; 256 ] = from_fn ( |_| PaddedVec { size : 0 , vec : Vec :: with_capacity ( 128 ) } ) ;
53+
54+ for ( second, slice) in indices. chunks ( input. len ( ) . div_ceil ( 256 ) ) . enumerate ( ) {
55+ let size = slice. len ( ) ;
56+
57+ mixed[ second] . size = size;
58+ mixed[ second] . vec . resize ( size. next_multiple_of ( 64 ) , 0 ) ;
59+ mixed[ second] . vec [ ..size] . copy_from_slice ( slice) ;
60+
61+ lookup. extend ( repeat_n ( second, size) ) ;
62+ skip[ second / 16 ] += size;
6563 }
6664
6765 for _ in 0 ..rounds {
6866 ' mix: for index in 0 ..input. len ( ) {
6967 // Quickly find the leaf vector storing the number.
7068 let number = numbers[ index] ;
71- let ( first, second) = lookup[ index] ;
69+ let second = lookup[ index] ;
70+ let first = second / 16 ;
71+
7272 // Third level changes as other numbers are added and removed,
7373 // so needs to be checked each time.
74- let third = mixed[ first ] [ second] . iter ( ) . position ( | & i| i == index ) . unwrap ( ) ;
74+ let third = position ( & mixed[ second] , index as u16 ) ;
7575
7676 // Find the offset of the number by adding the size of all previous `vec`s.
7777 let position = third
7878 + skip[ ..first] . iter ( ) . sum :: < usize > ( )
79- + mixed[ first] [ ..second] . iter ( ) . map ( Vec :: len ) . sum :: < usize > ( ) ;
79+ + mixed[ 16 * first..second] . iter ( ) . map ( |v| v . size ) . sum :: < usize > ( ) ;
8080 // Update our position, wrapping around if necessary.
8181 let mut next = ( position + number) % size;
8282
8383 // Remove number from current leaf vector, also updating the first level size.
84- mixed[ first] [ second] . remove ( third) ;
84+ mixed[ second] . size -= 1 ;
85+ mixed[ second] . vec . remove ( third) ;
86+ mixed[ second] . vec . push ( 0 ) ;
8587 skip[ first] -= 1 ;
8688
8789 // Find our new destination, by checking `vec`s in order until the total elements
8890 // are greater than our new index.
89- for ( first, outer) in mixed. iter_mut ( ) . enumerate ( ) {
91+ for ( first, outer) in mixed. chunks_exact_mut ( 16 ) . enumerate ( ) {
9092 if next > skip[ first] {
9193 next -= skip[ first] ;
9294 } else {
9395 for ( second, inner) in outer. iter_mut ( ) . enumerate ( ) {
94- if next > inner. len ( ) {
95- next -= inner. len ( ) ;
96+ if next > inner. size {
97+ next -= inner. size ;
9698 } else {
9799 // Insert number into its new home.
98- inner. insert ( next, index) ;
100+ inner. size += 1 ;
101+ inner. vec . insert ( next, index as u16 ) ;
102+ inner. vec . resize ( inner. size . next_multiple_of ( 64 ) , 0 ) ;
103+ // Update location.
99104 skip[ first] += 1 ;
100- lookup[ index] = ( first, second) ;
105+ lookup[ index] = 16 * first + second;
101106 continue ' mix;
102107 }
103108 }
@@ -106,12 +111,44 @@ fn decrypt(input: &[i64], key: i64, rounds: usize) -> i64 {
106111 }
107112 }
108113
109- let indices: Vec < _ > = mixed. into_iter ( ) . flatten ( ) . flatten ( ) . collect ( ) ;
110- let zeroth = indices. iter ( ) . position ( |& i| input[ i] == 0 ) . unwrap ( ) ;
114+ let indices: Vec < _ > =
115+ mixed. into_iter ( ) . flat_map ( |pv| pv. vec . into_iter ( ) . take ( pv. size ) ) . collect ( ) ;
116+ let zeroth = indices. iter ( ) . position ( |& i| input[ i as usize ] == 0 ) . unwrap ( ) ;
111117
112118 [ 1000 , 2000 , 3000 ]
113119 . iter ( )
114120 . map ( |offset| ( zeroth + offset) % indices. len ( ) )
115- . map ( |index| input[ indices[ index] ] * key)
121+ . map ( |index| input[ indices[ index] as usize ] * key)
116122 . sum ( )
117123}
124+
125+ /// The compiler optimizes the position search when the size of the chunk is known.
126+ #[ cfg( not( feature = "simd" ) ) ]
127+ #[ inline]
128+ fn position ( haystack : & PaddedVec , needle : u16 ) -> usize {
129+ for ( base, slice) in haystack. vec . chunks_exact ( 64 ) . enumerate ( ) {
130+ if let Some ( offset) = slice. iter ( ) . position ( |& i| i == needle) {
131+ return 64 * base + offset;
132+ }
133+ }
134+
135+ unreachable ! ( )
136+ }
137+
138+ /// Search 64 lanes simultaneously.
139+ #[ cfg( feature = "simd" ) ]
140+ #[ inline]
141+ fn position ( haystack : & PaddedVec , needle : u16 ) -> usize {
142+ use std:: simd:: cmp:: SimdPartialEq as _;
143+ use std:: simd:: * ;
144+
145+ for ( base, slice) in haystack. vec . chunks_exact ( 64 ) . enumerate ( ) {
146+ if let Some ( offset) =
147+ Simd :: < u16 , 64 > :: from_slice ( slice) . simd_eq ( Simd :: splat ( needle) ) . first_set ( )
148+ {
149+ return 64 * base + offset;
150+ }
151+ }
152+
153+ unreachable ! ( )
154+ }
0 commit comments