#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Permute a Salsa20 block to column-major order
const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Inverse of PIVOT_ABCD
// Computed at compile time; `while` loops are used because `for` is not
// available in const evaluation.
const INVERSE_PIVOT_ABCD: [usize; 16] = const {
    let mut index = [0; 16];
    let mut i = 0;
    while i < 16 {
        let mut inverse = 0;
        while inverse < 16 {
            if PIVOT_ABCD[inverse] == i {
                index[i] = inverse;
                break;
            }
            inverse += 1;
        }
        i += 1;
    }
    index
};

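// Illustrative sketch (not part of this commit): a quick check that the two
// tables are mutual inverses, so the pre-permutation applied in scrypt_ro_mix
// below is exactly undone at the end. The test name is hypothetical.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
#[test]
fn pivot_abcd_roundtrip() {
    for i in 0..16 {
        assert_eq!(INVERSE_PIVOT_ABCD[PIVOT_ABCD[i]], i);
        assert_eq!(PIVOT_ABCD[INVERSE_PIVOT_ABCD[i]], i);
    }
}
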
/// Execute the ROMix operation in-place.
/// b - the data to operate on
/// v - a temporary variable to store the vector V
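// Illustrative call shape (not part of this commit; r = 8 and n = 1024 are
// hypothetical scrypt parameters). b and t are single 128 * r byte blocks,
// while v must hold n such blocks:
//
//     let (r, n) = (8, 1024);
//     let mut b = vec![0u8; 128 * r];
//     let mut v = vec![0u8; 128 * r * n];
//     let mut t = vec![0u8; 128 * r];
//     scrypt_ro_mix(&mut b, &mut v, &mut t, n);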
@@ -18,6 +41,17 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)

    let len = b.len();

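    // Reorder each 64-byte block into the word order expected by the SSE2
    // scrypt_block_mix below (see PIVOT_ABCD); the inverse permutation after
    // the main loops restores the standard layout.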
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    for chunk in b.chunks_exact_mut(64) {
        let mut t = [0u32; 16];
        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
            *b = u32::from_ne_bytes(c.try_into().unwrap());
        }
        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
        });
    }

    for chunk in v.chunks_mut(len) {
        chunk.copy_from_slice(b);
        scrypt_block_mix(chunk, b);
@@ -28,11 +62,23 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
        xor(b, &v[j * len..(j + 1) * len], t);
        scrypt_block_mix(t, b);
    }
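
    // Undo the pre-loop permutation so that b leaves this function in the
    // standard Salsa20 byte order again.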
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    for chunk in b.chunks_exact_mut(64) {
        let mut t = [0u32; 16];
        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
            *b = u32::from_ne_bytes(c.try_into().unwrap());
        }
        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
        });
    }
}

/// Execute the BlockMix operation
/// input - the input vector. The length must be a multiple of 128.
/// output - the output vector. Must be the same length as input.
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
    use salsa20::{
        SalsaCore,
@@ -67,6 +113,76 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
    }
}

/// Execute the BlockMix operation
/// input - the input vector. The length must be a multiple of 128.
/// output - the output vector. Must be the same length as input.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;

    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    // Rotate each 32-bit lane of $w left by the constant $amt.
    macro_rules! mm_rol_epi32x {
        ($w:expr, $amt:literal) => {{
            let w = $w;
            _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
        }};
    }

    // X starts as the last 64-byte block of the input.
    let mut x = [0u8; 64];
    x.copy_from_slice(&input[input.len() - 64..]);

    let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) };
    let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) };
    let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) };
    let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) };

    for (i, chunk) in input.chunks(64).enumerate() {
        // BlockMix output order: even-numbered blocks go to the first half of
        // the output, odd-numbered blocks to the second half.
        let pos = if i % 2 == 0 {
            (i / 2) * 64
        } else {
            (i / 2) * 64 + input.len() / 2
        };

        unsafe {
            a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast()));
            b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast()));
            c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast()));
            d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast()));

            // Save the state to add back after the rounds (Salsa20 feed-forward).
            let saves = [a, b, c, d];

            // Salsa20/8 core: each iteration performs one full round (four
            // quarter-rounds in parallel), eight rounds in total.
            for _ in 0..8 {
                b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7));
                c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9));
                d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13));
                a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18));

                // a stays in place
                // b = left shuffle d by 1 element
                d = _mm_shuffle_epi32(d, 0b00_11_10_01);
                // c = left shuffle c by 2 elements
                c = _mm_shuffle_epi32(c, 0b01_00_11_10);
                // d = left shuffle b by 3 elements
                b = _mm_shuffle_epi32(b, 0b10_01_00_11);
                (b, d) = (d, b);
            }

            a = _mm_add_epi32(a, saves[0]);
            b = _mm_add_epi32(b, saves[1]);
            c = _mm_add_epi32(c, saves[2]);
            d = _mm_add_epi32(d, saves[3]);

            _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a);
            _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b);
            _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c);
            _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d);
        }
    }
}
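
// Illustrative sketch (not part of this commit): the mm_rol_epi32x macro above
// builds a per-lane rotate-left from shift-and-or, i.e. (w << n) | (w >> (32 - n)),
// which can be checked against u32::rotate_left. Test name and input values are
// hypothetical.
#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
#[test]
fn sse2_rotate_matches_scalar() {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    let words = [0x0123_4567u32, 0x89ab_cdef, 0xdead_beef, 0x0000_0001];
    let mut out = [0u32; 4];
    unsafe {
        let w = _mm_loadu_si128(words.as_ptr().cast());
        let rotated = _mm_or_si128(_mm_slli_epi32(w, 7), _mm_srli_epi32(w, 32 - 7));
        _mm_storeu_si128(out.as_mut_ptr().cast(), rotated);
    }
    for (rotated, word) in out.iter().zip(words.iter()) {
        assert_eq!(*rotated, word.rotate_left(7));
    }
}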

fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
    for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
        *out = x_i ^ y_i;