@@ -242,17 +242,21 @@ char *strrchr(const char *s, int c) {
242242// SIMDized check which bytes are in a set (Geoff Langdale)
243243// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
244244
245+ // This is the same algorithm as truffle from Hyperscan:
246+ // https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
247+ // https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
248+
245249typedef struct {
246250 __u8x16 lo ;
247251 __u8x16 hi ;
248252} __wasm_v128_bitmap256_t ;
249253
250254__attribute__((always_inline ))
251- static void __wasm_v128_setbit (__wasm_v128_bitmap256_t * bitmap , int i ) {
252- uint8_t hi_nibble = ( uint8_t ) i >> 4 ;
253- uint8_t lo_nibble = ( uint8_t ) i & 0xf ;
254- bitmap -> lo [lo_nibble ] |= (uint8_t )(( uint32_t ) 1 << (hi_nibble - 0 ));
255- bitmap -> hi [lo_nibble ] |= (uint8_t )(( uint32_t ) 1 << (hi_nibble - 8 ));
255+ static void __wasm_v128_setbit (__wasm_v128_bitmap256_t * bitmap , uint8_t i ) {
256+ uint8_t hi_nibble = i >> 4 ;
257+ uint8_t lo_nibble = i & 0xf ;
258+ bitmap -> lo [lo_nibble ] |= (uint8_t )(1u << (hi_nibble - 0 ));
259+ bitmap -> hi [lo_nibble ] |= (uint8_t )(1u << (hi_nibble - 8 ));
256260}
257261
258262#ifndef __wasm_relaxed_simd__
@@ -264,18 +268,17 @@ static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
// For each byte of `v`, produce a non-zero lane iff that byte's bit is set
// in `bitmap` (Langdale's SIMD byte-in-set check; same scheme as truffle).
// Note this returns the raw masked bitsets, not a canonical 0/0xff boolean
// vector — callers test lanes with all_true/any_true and re-derive a
// boolean vector only on the slow path.
__attribute__((always_inline))
static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
  v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
  // Little-endian bytes 01 02 04 08 10 20 40 80, twice: lookup[n] is
  // 1 << (n & 7), the bit each high nibble selects within its row.
  // hi_nibbles are always < 16, so relaxed and strict swizzle agree here.
  v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
  v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);

  // Keep each byte's low nibble (the row index) plus bit 7.  Swizzle
  // returns 0 for any index >= 16, so bytes >= 0x80 select nothing from
  // the .lo table; flipping bit 7 makes bytes < 0x80 select nothing from
  // the .hi table.
  v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
  v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);

  v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
  v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);

  // Exactly one of the two rows is the byte's real row; the other is 0.
  v128_t bitsets = row_0_7 | row_8_15;
  return bitsets & bitmask;
}
280283
281284#undef wasm_i8x16_relaxed_swizzle
@@ -317,17 +320,18 @@ size_t strspn(const char *s, const char *c) {
317320
318321 for (; * c ; c ++ ) {
319322 // Terminator IS NOT on the bitmap.
320- __wasm_v128_setbit (& bitmap , * c );
323+ __wasm_v128_setbit (& bitmap , ( uint8_t ) * c );
321324 }
322325
323326 for (;;) {
324327 v128_t v = * (v128_t * )addr ;
325- v128_t cmp = __wasm_v128_chkbits (bitmap , v );
328+ v128_t found = __wasm_v128_chkbits (bitmap , v );
326329 // Bitmask is slow on AArch64, all_true is much faster.
327- if (!wasm_i8x16_all_true (cmp )) {
330+ if (!wasm_i8x16_all_true (found )) {
331+ v128_t cmp = wasm_i8x16_eq (found , (v128_t ){});
328332 // Clear the bits corresponding to align (little-endian)
329333 // so we can count trailing zeros.
330- int mask = ( uint16_t )~ wasm_i8x16_bitmask (cmp ) >> align << align ;
334+ int mask = wasm_i8x16_bitmask (cmp ) >> align << align ;
331335 // At least one bit will be set, unless align cleared them.
332336 // Knowing this helps the compiler if it unrolls the loop.
333337 __builtin_assume (mask || align );
@@ -356,17 +360,18 @@ size_t strcspn(const char *s, const char *c) {
356360
357361 do {
358362 // Terminator IS on the bitmap.
359- __wasm_v128_setbit (& bitmap , * c );
363+ __wasm_v128_setbit (& bitmap , ( uint8_t ) * c );
360364 } while (* c ++ );
361365
362366 for (;;) {
363367 v128_t v = * (v128_t * )addr ;
364- v128_t cmp = __wasm_v128_chkbits (bitmap , v );
368+ v128_t found = __wasm_v128_chkbits (bitmap , v );
365369 // Bitmask is slow on AArch64, any_true is much faster.
366- if (wasm_v128_any_true (cmp )) {
370+ if (wasm_v128_any_true (found )) {
371+ v128_t cmp = wasm_i8x16_eq (found , (v128_t ){});
367372 // Clear the bits corresponding to align (little-endian)
368373 // so we can count trailing zeros.
369- int mask = wasm_i8x16_bitmask (cmp ) >> align << align ;
374+ int mask = ( uint16_t )~ wasm_i8x16_bitmask (cmp ) >> align << align ;
370375 // At least one bit will be set, unless align cleared them.
371376 // Knowing this helps the compiler if it unrolls the loop.
372377 __builtin_assume (mask || align );
0 commit comments