
Commit 9a017e9

scrypt: sse2 RoMix optimization
Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
1 parent e75b27d commit 9a017e9

File tree: 1 file changed, +116 -0 lines changed


scrypt/src/romix.rs

Lines changed: 116 additions & 0 deletions
@@ -1,3 +1,26 @@
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// Permute Salsa20 block to column major order
+const PIVOT_ABCD: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11];
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// Inverse of PIVOT_ABCD
+const INVERSE_PIVOT_ABCD: [usize; 16] = const {
+    let mut index = [0; 16];
+    let mut i = 0;
+    while i < 16 {
+        let mut inverse = 0;
+        while inverse < 16 {
+            if PIVOT_ABCD[inverse] == i {
+                index[i] = inverse;
+                break;
+            }
+            inverse += 1;
+        }
+        i += 1;
+    }
+    index
+};
+
 /// Execute the ROMix operation in-place.
 /// b - the data to operate on
 /// v - a temporary variable to store the vector V
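
PIVOT_ABCD reorders the sixteen 32-bit words of a 64-byte Salsa20 block so that each 16-byte quarter can be loaded straight into one SSE register: the first quarter holds words 0, 5, 10 and 15 of the original block, and so on, matching the a/b/c/d registers used in the SSE2 scrypt_block_mix further down. A minimal illustrative check of that mapping (a sketch, not part of this commit):

    // Illustrative sketch only, not part of this commit: after applying
    // PIVOT_ABCD, each 4-word group holds the words that one SSE register
    // of the SSE2 block mix works on.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn pivot_groups_words_per_register() {
        let words: [u32; 16] = core::array::from_fn(|i| i as u32);
        let permuted: [u32; 16] = core::array::from_fn(|i| words[PIVOT_ABCD[i]]);
        assert_eq!(&permuted[0..4], &[0, 5, 10, 15]); // lanes of register a
        assert_eq!(&permuted[4..8], &[4, 9, 14, 3]); // lanes of register b
        assert_eq!(&permuted[8..12], &[8, 13, 2, 7]); // lanes of register c
        assert_eq!(&permuted[12..16], &[12, 1, 6, 11]); // lanes of register d
    }
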
@@ -18,6 +41,17 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
 
     let len = b.len();
 
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
+
     for chunk in v.chunks_mut(len) {
         chunk.copy_from_slice(b);
         scrypt_block_mix(chunk, b);
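
The same word shuffle appears twice in scrypt_ro_mix: once with PIVOT_ABCD before the main loop, and once with INVERSE_PIVOT_ABCD after it (next hunk). A hypothetical shared helper equivalent to the loop above could look like this (the function name is illustrative and not from the commit):

    // Hypothetical helper, not part of this commit: applies a 16-word
    // permutation to one 64-byte Salsa20 block in place. `perm` would be
    // PIVOT_ABCD for the forward pass or INVERSE_PIVOT_ABCD to undo it.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn permute_block(chunk: &mut [u8], perm: &[usize; 16]) {
        debug_assert_eq!(chunk.len(), 64);
        let mut words = [0u32; 16];
        for (src, w) in chunk.chunks_exact(4).zip(words.iter_mut()) {
            *w = u32::from_ne_bytes(src.try_into().unwrap());
        }
        for (i, dst) in chunk.chunks_exact_mut(4).enumerate() {
            dst.copy_from_slice(&words[perm[i]].to_ne_bytes());
        }
    }
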
@@ -28,11 +62,23 @@ pub(crate) fn scrypt_ro_mix(b: &mut [u8], v: &mut [u8], t: &mut [u8], n: usize)
         xor(b, &v[j * len..(j + 1) * len], t);
         scrypt_block_mix(t, b);
     }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    for chunk in b.chunks_exact_mut(64) {
+        let mut t = [0u32; 16];
+        for (c, b) in chunk.chunks_exact(4).zip(t.iter_mut()) {
+            *b = u32::from_ne_bytes(c.try_into().unwrap());
+        }
+        chunk.chunks_exact_mut(4).enumerate().for_each(|(i, b)| {
+            b.copy_from_slice(&t[INVERSE_PIVOT_ABCD[i]].to_ne_bytes());
+        });
+    }
 }
 
 /// Execute the BlockMix operation
 /// input - the input vector. The length must be a multiple of 128.
 /// output - the output vector. Must be the same length as input.
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
 fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     use salsa20::{
         SalsaCore,
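
INVERSE_PIVOT_ABCD is constructed so that this final pass exactly undoes the permutation applied before the loop, restoring b to its standard word order. A small sanity check of that property (illustrative only, not part of the commit):

    // Applying PIVOT_ABCD and then INVERSE_PIVOT_ABCD, in the same way the
    // two loops in scrypt_ro_mix do, must restore the original word order.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[test]
    fn pivot_then_inverse_is_identity() {
        let orig: [u32; 16] = core::array::from_fn(|i| (i * 7 + 1) as u32);
        let permuted: [u32; 16] = core::array::from_fn(|i| orig[PIVOT_ABCD[i]]);
        let restored: [u32; 16] = core::array::from_fn(|i| permuted[INVERSE_PIVOT_ABCD[i]]);
        assert_eq!(orig, restored);
    }
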
@@ -67,6 +113,76 @@ fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
     }
 }
 
+/// Execute the BlockMix operation
+/// input - the input vector. The length must be a multiple of 128.
+/// output - the output vector. Must be the same length as input.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn scrypt_block_mix(input: &[u8], output: &mut [u8]) {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+
+    macro_rules! mm_rol_epi32x {
+        ($w:expr, $amt:literal) => {{
+            let w = $w;
+            _mm_or_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
+        }};
+    }
+
+    let mut x = [0u8; 64];
+    x.copy_from_slice(&input[input.len() - 64..]);
+
+    let mut a = unsafe { _mm_loadu_si128(x.as_ptr().cast()) };
+    let mut b = unsafe { _mm_loadu_si128(x.as_ptr().add(16).cast()) };
+    let mut c = unsafe { _mm_loadu_si128(x.as_ptr().add(32).cast()) };
+    let mut d = unsafe { _mm_loadu_si128(x.as_ptr().add(48).cast()) };
+
+    for (i, chunk) in input.chunks(64).enumerate() {
+        let pos = if i % 2 == 0 {
+            (i / 2) * 64
+        } else {
+            (i / 2) * 64 + input.len() / 2
+        };
+
+        unsafe {
+            a = _mm_xor_si128(a, _mm_loadu_si128(chunk.as_ptr().cast()));
+            b = _mm_xor_si128(b, _mm_loadu_si128(chunk.as_ptr().add(16).cast()));
+            c = _mm_xor_si128(c, _mm_loadu_si128(chunk.as_ptr().add(32).cast()));
+            d = _mm_xor_si128(d, _mm_loadu_si128(chunk.as_ptr().add(48).cast()));
+
+            let saves = [a, b, c, d];
+
+            for _ in 0..8 {
+                b = _mm_xor_si128(b, mm_rol_epi32x!(_mm_add_epi32(a, d), 7));
+                c = _mm_xor_si128(c, mm_rol_epi32x!(_mm_add_epi32(b, a), 9));
+                d = _mm_xor_si128(d, mm_rol_epi32x!(_mm_add_epi32(c, b), 13));
+                a = _mm_xor_si128(a, mm_rol_epi32x!(_mm_add_epi32(d, c), 18));
+
+                // a stays in place
+                // b = left shuffle d by 1 element
+                d = _mm_shuffle_epi32(d, 0b00_11_10_01);
+                // c = left shuffle c by 2 elements
+                c = _mm_shuffle_epi32(c, 0b01_00_11_10);
+                // d = left shuffle b by 3 elements
+                b = _mm_shuffle_epi32(b, 0b10_01_00_11);
+                (b, d) = (d, b);
+            }
+
+            a = _mm_add_epi32(a, saves[0]);
+            b = _mm_add_epi32(b, saves[1]);
+            c = _mm_add_epi32(c, saves[2]);
+            d = _mm_add_epi32(d, saves[3]);
+
+            _mm_storeu_si128(output.as_mut_ptr().add(pos).cast(), a);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 16).cast(), b);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 32).cast(), c);
+            _mm_storeu_si128(output.as_mut_ptr().add(pos + 48).cast(), d);
+        }
+    }
+}
+
 fn xor(x: &[u8], y: &[u8], output: &mut [u8]) {
     for ((out, &x_i), &y_i) in output.iter_mut().zip(x.iter()).zip(y.iter()) {
         *out = x_i ^ y_i;
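
The pos computation in the SSE2 scrypt_block_mix writes even-indexed 64-byte outputs to the first half of output and odd-indexed outputs to the second half, the even/odd interleaving step of scrypt's BlockMix. A short illustration of the resulting offsets, assuming a hypothetical 256-byte input (r = 2, four 64-byte blocks); this is a sketch, not part of the commit:

    // Illustrative only: offsets produced by the `pos` formula for a 256-byte
    // input. Blocks Y0..Y3 land at 0, 128, 64, 192, i.e. even-indexed blocks
    // fill the first half of the output and odd-indexed blocks the second half.
    let input_len = 256usize;
    let positions: Vec<usize> = (0..input_len / 64)
        .map(|i| {
            if i % 2 == 0 {
                (i / 2) * 64
            } else {
                (i / 2) * 64 + input_len / 2
            }
        })
        .collect();
    assert_eq!(positions, vec![0, 128, 64, 192]);
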
