Skip to content

Commit c65ad4e

Browse files
wasm32 kernel
Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
1 parent e991eb9 commit c65ad4e

File tree

1 file changed

+103
-9
lines changed

1 file changed

+103
-9
lines changed

scrypt/src/romix.rs

Lines changed: 103 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1+
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
22
/// Permute Salsa20 block to column major order
33
const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
44

5-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
5+
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "wasm32"))]
66
/// Inverse of PIVOT_ABCD
77
const INVERSE_PIVOT_ABCD: [usize; 16] = const {
88
let mut index = [0; 16];
@@ -41,7 +41,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
4141

4242
let len = b.len();
4343

44-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
44+
#[cfg(any(
45+
target_arch = "x86",
46+
target_arch = "x86_64",
47+
all(target_arch = "wasm32", target_feature = "simd128")
48+
))]
4549
for chunk in b.chunks_exact_mut(64) {
4650
let mut t = [0u32; 16];
4751
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -55,25 +59,45 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
5559
for chunk in v.chunks_mut(len) {
5660
chunk.copy_from_slice(b);
5761

58-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
62+
#[cfg(any(
63+
target_arch = "x86",
64+
target_arch = "x86_64",
65+
all(target_arch = "wasm32", target_feature = "simd128")
66+
))]
5967
scrypt_block_mix_abcd(chunk, b);
6068

61-
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
69+
#[cfg(not(any(
70+
target_arch = "x86",
71+
target_arch = "x86_64",
72+
all(target_arch = "wasm32", target_feature = "simd128")
73+
)))]
6274
scrypt_block_mix(chunk, b);
6375
}
6476

6577
for _ in 0..n {
6678
let j = integerify(b, n);
6779
xor(b, &v[j * len..(j + 1) * len], t);
6880

69-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
81+
#[cfg(any(
82+
target_arch = "x86",
83+
target_arch = "x86_64",
84+
all(target_arch = "wasm32", target_feature = "simd128")
85+
))]
7086
scrypt_block_mix_abcd(t, b);
7187

72-
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
88+
#[cfg(not(any(
89+
target_arch = "x86",
90+
target_arch = "x86_64",
91+
all(target_arch = "wasm32", target_feature = "simd128")
92+
)))]
7393
scrypt_block_mix(t, b);
7494
}
7595

76-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
96+
#[cfg(any(
97+
target_arch = "x86",
98+
target_arch = "x86_64",
99+
all(target_arch = "wasm32", target_feature = "simd128")
100+
))]
77101
for chunk in b.chunks_exact_mut(64) {
78102
let mut t = [0u32; 16];
79103
for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
@@ -88,7 +112,11 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
88112
/// Execute the BlockMix operation
89113
/// input - the input vector. The length must be a multiple of 128.
90114
/// output - the output vector. Must be the same length as input.
91-
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
115+
#[cfg(not(any(
116+
target_arch = "x86",
117+
target_arch = "x86_64",
118+
all(target_arch = "wasm32", target_feature = "simd128")
119+
)))]
92120
fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
93121
use salsa20::{
94122
SalsaCore,
@@ -192,6 +220,72 @@ fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
192220
}
193221
}
194222

223+
/// Execute the BlockMix operation with pre-shuffled input.
///
/// `input` is processed as consecutive 64-byte blocks, each held as four 128-bit
/// vectors `a`, `b`, `c`, `d` of little-endian 32-bit lanes. The running state starts
/// as the last block of `input`; for every block it is XORed with that block, run
/// through eight add-rotate-xor rounds (rotation amounts 7, 9, 13, 18), and added
/// lane-wise to its pre-round value. Results of even-indexed blocks are written
/// back-to-back into the first half of `output`, odd-indexed ones into the second half.
///
/// input - the input vector. The length must be a non-zero multiple of 128.
/// output - the output vector. Must be the same length as input.
///
/// # Panics
///
/// Panics if `input` is empty, if its length is not a multiple of 128, or if
/// `output` has a different length than `input`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
fn scrypt_block_mix_abcd(input: &[u8], output: &mut [u8]) {
    use core::arch::wasm32::*;

    // Rotate every 32-bit lane left by `$amt` bits. The operand is bound once so the
    // argument expression is not expanded (and evaluated) twice.
    macro_rules! u32x4_rol {
        ($x:expr, $amt:literal) => {{
            let x = $x;
            v128_or(u32x4_shl(x, $amt), u32x4_shr(x, 32 - $amt))
        }};
    }

    // The raw-pointer loads/stores below rely on these length invariants, so they are
    // checked unconditionally rather than with `debug_assert!`.
    assert!(
        !input.is_empty() && input.len() % 128 == 0,
        "BlockMix input length must be a non-zero multiple of 128"
    );
    assert_eq!(
        input.len(),
        output.len(),
        "BlockMix input and output must have the same length"
    );

    let half = input.len() / 2;
    let last_block = &input[input.len() - 64..];

    // SAFETY: `last_block` is exactly 64 bytes long, so the four 16-byte reads at
    // offsets 0, 16, 32 and 48 are in bounds. `v128_load` performs an unaligned load.
    let (mut a, mut b, mut c, mut d) = unsafe {
        let src = last_block.as_ptr();
        (
            v128_load(src.cast()),
            v128_load(src.add(16).cast()),
            v128_load(src.add(32).cast()),
            v128_load(src.add(48).cast()),
        )
    };

    for (i, chunk) in input.chunks_exact(64).enumerate() {
        // Even-indexed blocks fill the first half of the output, odd-indexed the second.
        let pos = if i % 2 == 0 {
            (i / 2) * 64
        } else {
            (i / 2) * 64 + half
        };

        // SAFETY: `chunks_exact(64)` only yields slices of exactly 64 bytes, so the four
        // 16-byte reads are in bounds. `v128_load` performs an unaligned load.
        let (chunk_a, chunk_b, chunk_c, chunk_d) = unsafe {
            let src = chunk.as_ptr();
            (
                v128_load(src.cast()),
                v128_load(src.add(16).cast()),
                v128_load(src.add(32).cast()),
                v128_load(src.add(48).cast()),
            )
        };

        a = v128_xor(a, chunk_a);
        b = v128_xor(b, chunk_b);
        c = v128_xor(c, chunk_c);
        d = v128_xor(d, chunk_d);

        // Kept for the feed-forward addition after the rounds.
        let saves = [a, b, c, d];

        for _ in 0..8 {
            b = v128_xor(b, u32x4_rol!(u32x4_add(a, d), 7));
            c = v128_xor(c, u32x4_rol!(u32x4_add(b, a), 9));
            d = v128_xor(d, u32x4_rol!(u32x4_add(c, b), 13));
            a = v128_xor(a, u32x4_rol!(u32x4_add(d, c), 18));

            // Re-align the lanes for the next round: `a` stays in place, `d`, `c` and
            // `b` are rotated by 1, 2 and 3 lanes, then `b` and `d` trade places.
            d = i32x4_shuffle::<1, 2, 3, 0>(d, d);
            c = i32x4_shuffle::<2, 3, 0, 1>(c, c);
            b = i32x4_shuffle::<3, 0, 1, 2>(b, b);

            (b, d) = (d, b);
        }

        a = u32x4_add(a, saves[0]);
        b = u32x4_add(b, saves[1]);
        c = u32x4_add(c, saves[2]);
        d = u32x4_add(d, saves[3]);

        // Bounds-checked view of the destination block; panics instead of writing
        // out of bounds if the invariants above were ever violated.
        let out = &mut output[pos..pos + 64];

        // SAFETY: `out` is exactly 64 bytes long, so the four 16-byte writes at offsets
        // 0, 16, 32 and 48 are in bounds. `v128_store` performs an unaligned store.
        unsafe {
            let dst = out.as_mut_ptr();
            v128_store(dst.cast(), a);
            v128_store(dst.add(16).cast(), b);
            v128_store(dst.add(32).cast(), c);
            v128_store(dst.add(48).cast(), d);
        }
    }
}
288+
195289
fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
196290
for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
197291
*out = x_i ^ y_i;

0 commit comments

Comments
 (0)