-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
 /// Permute Salsa20 block to column major order
 const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
 /// Inverse of PIVOT_ABCD
 const INVERSE_PIVOT_ABCD: [usize; 16] = const {
     let mut index = [0; 16];
@@ -41,7 +41,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     let len = b.len();
 
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
     for chunk in b.chunks_exact_mut(64) {
         let mut t = [0u32; 16];
         for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -55,25 +59,45 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
 
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        ))]
         scrypt_block_mix_abcd(chunk, b);
 
-        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        #[cfg(not(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        )))]
         scrypt_block_mix(chunk, b);
     }
 
     for _ in 0..n {
         let j = integerify(b, n);
         xor(b, &v[j * len..(j + 1) * len], t);
 
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[cfg(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        ))]
         scrypt_block_mix_abcd(t, b);
 
-        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+        #[cfg(not(any(
+            target_arch = "x86",
+            target_arch = "x86_64",
+            all(target_arch = "wasm32", target_feature = "simd128")
+        )))]
         scrypt_block_mix(t, b);
     }
 
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[cfg(any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
     for chunk in b.chunks_exact_mut(64) {
         let mut t = [0u32; 16];
         for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -88,7 +112,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 /// Execute the BlockMix operation
 /// input - the input vector. The length must be a multiple of 128.
 /// output - the output vector. Must be the same length as input.
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(not(any(
+    target_arch = "x86",
+    target_arch = "x86_64",
+    all(target_arch = "wasm32", target_feature = "simd128")
+)))]
 fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     use salsa20::{
         SalsaCore,
@@ -192,6 +220,89 @@ fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
     }
 }
 
+/// Execute the BlockMix operation with pre-shuffled input.
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
+    use core::arch::wasm32::*;
+
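+    // Rotate each 32-bit lane left by `$amt`. simd128 has no native
+    // rotate, so it is composed from two shifts and an OR.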
+    macro_rules! u32x4_rol {
+        ($x:expr, $amt:literal) => {
+            v128_or(u32x4_shl($x, $amt), u32x4_shr($x, 32 - $amt))
+        };
+    }
+
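+    // BlockMix starts from the last 64-byte block of the input (B[2r - 1]).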
+    let last_block = &input[input.len() - 64..];
+
+    let mut a = unsafe { v128_load(last_block.as_ptr().cast()) };
+    let mut b = unsafe { v128_load(last_block.as_ptr().add(16).cast()) };
+    let mut c = unsafe { v128_load(last_block.as_ptr().add(32).cast()) };
+    let mut d = unsafe { v128_load(last_block.as_ptr().add(48).cast()) };
+
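+    // X = Salsa20/8(X ^ B[i]) for each 64-byte block of the input.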
+    for (i, chunk) in input.chunks(64).enumerate() {
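+        // Even-indexed blocks land in the first half of the output and
+        // odd-indexed blocks in the second half, per the Y interleave
+        // step of BlockMix (RFC 7914).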
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            let chunk_a = v128_load(chunk.as_ptr().cast());
+            let chunk_b = v128_load(chunk.as_ptr().add(16).cast());
+            let chunk_c = v128_load(chunk.as_ptr().add(32).cast());
+            let chunk_d = v128_load(chunk.as_ptr().add(48).cast());
+
+            a = v128_xor(a, chunk_a);
+            b = v128_xor(b, chunk_b);
+            c = v128_xor(c, chunk_c);
+            d = v128_xor(d, chunk_d);
+
+            let saves = [a, b, c, d];
+
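+            // Salsa20/8 core: one round per iteration, eight rounds in
+            // total. The state was pre-shuffled to column-major order
+            // (PIVOT_ABCD), so each quarter-round operates on all four
+            // columns at once, one per lane.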
+            for _ in 0..8 {
+                b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
+                c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
+                d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
+                a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));
+
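+                // Rotate the lanes and swap b with d so the same
+                // quarter-round code alternates between column and row
+                // rounds on successive iterations; after an even number
+                // of rounds the layout is back where it started.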
+                d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
+                c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
+                b = i32x4_shuffle::<3, 0, 1, 2>(b, b);
+
+                (b, d) = (d, b);
+            }
+
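+            // Feed-forward: add the pre-round state back in, as the
+            // Salsa20 core does, then store the block at its slot.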
+            a = u32x4_add(a, saves[0]);
+            b = u32x4_add(b, saves[1]);
+            c = u32x4_add(c, saves[2]);
+            d = u32x4_add(d, saves[3]);
+
+            v128_store(output.as_mut_ptr().add(pos).cast(), a);
+            v128_store(output.as_mut_ptr().add(pos + 16).cast(), b);
+            v128_store(output.as_mut_ptr().add(pos + 32).cast(), c);
+            v128_store(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
+
 fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
     for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
         *out = x_i ^ y_i;