#include "absl/base/prefetch.h"
#include "absl/hash/internal/city.h"

+
+#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
+#error ABSL_AES_INTERNAL_HAVE_X86_SIMD cannot be directly set
+#elif defined(__SSE4_2__) && defined(__AES__)
+#define ABSL_AES_INTERNAL_HAVE_X86_SIMD
+#endif
+
+
+#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <xmmintrin.h>
+#endif  // ABSL_AES_INTERNAL_HAVE_X86_SIMD
+
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace hash_internal {
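
An aside, not part of the commit: the guard above enables the SIMD path only when the translation unit is compiled with both SSE4.2 and AES-NI support, which with GCC or Clang typically means flags along the lines of -msse4.2 -maes (exact flags depend on the toolchain and target). A minimal, self-contained check of which path a given build would take; the program below is purely illustrative:

    #include <cstdio>

    int main() {
    #if defined(__SSE4_2__) && defined(__AES__)
      // Same condition the commit uses to define ABSL_AES_INTERNAL_HAVE_X86_SIMD.
      std::puts("AES/SSE4.2 SIMD path would be selected");
    #else
      std::puts("portable fallback path would be selected");
    #endif
    }
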
@@ -43,47 +57,86 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
  return cs0 ^ cs1;
}

-[[maybe_unused]] uint64_t LowLevelHashLenGt32(const void* data, size_t len,
-                                              uint64_t seed) {
+#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
+uint64_t LowLevelHash33To64(const uint8_t* ptr, size_t len, uint64_t seed) {
  assert(len > 32);
-  const uint8_t* ptr = static_cast<const uint8_t*>(data);
+  assert(len <= 64);
+  __m128i state =
+      _mm_set_epi64x(static_cast<int64_t>(seed), static_cast<int64_t>(len));
+  auto a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+  auto b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr + 16));
+  auto* last32_ptr = ptr + len - 32;
+  auto c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(last32_ptr));
+  auto d = _mm_loadu_si128(reinterpret_cast<const __m128i*>(last32_ptr + 16));
+
+  // _mm_aesenc_si128/_mm_aesdec_si128 run one AES (inverse) round on their
+  // first argument and XOR the second argument into the result, so the state
+  // is folded back in after each round. We use each loaded value as the
+  // first argument to shuffle all of its bits around.
+  // We do not add any salt to the state or the loaded data; instead we vary
+  // the instructions used to mix bits (_mm_aesdec_si128/_mm_aesenc_si128 and
+  // _mm_add_epi64/_mm_sub_epi64).
+  // _mm_add_epi64/_mm_sub_epi64 can fuse the data load into one instruction,
+  // e.g. `vpaddq xmm1, xmm0, xmmword ptr [rdi]`.
+  auto na = _mm_aesdec_si128(_mm_add_epi64(state, a), state);
+  auto nb = _mm_aesdec_si128(_mm_sub_epi64(state, b), state);
+  auto nc = _mm_aesenc_si128(_mm_add_epi64(state, c), state);
+  auto nd = _mm_aesenc_si128(_mm_sub_epi64(state, d), state);
+
+  // We perform another round of encryption to mix bits between two halves of
+  // the input.
+  auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(na, nc), nd),
+                              _mm_aesdec_si128(_mm_sub_epi64(nb, nd), na));
+  auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
+  auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
+  return x64 ^ y64;
+}
+#else
+uint64_t LowLevelHash33To64(const uint8_t* ptr, size_t len, uint64_t seed) {
+  assert(len > 32);
+  assert(len <= 64);
  uint64_t current_state = seed ^ kStaticRandomData[0] ^ len;
  const uint8_t* last_32_ptr = ptr + len - 32;
+  return Mix32Bytes(last_32_ptr, Mix32Bytes(ptr, current_state));
+}
+#endif  // ABSL_AES_INTERNAL_HAVE_X86_SIMD

-  if (len > 64) {
-    // If we have more than 64 bytes, we're going to handle chunks of 64
-    // bytes at a time. We're going to build up four separate hash states
-    // which we will then hash together. This avoids short dependency chains.
-    uint64_t duplicated_state0 = current_state;
-    uint64_t duplicated_state1 = current_state;
-    uint64_t duplicated_state2 = current_state;
-
-    do {
-      PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
-
-      uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
-      uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
-      uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
-      uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
-      uint64_t e = absl::base_internal::UnalignedLoad64(ptr + 32);
-      uint64_t f = absl::base_internal::UnalignedLoad64(ptr + 40);
-      uint64_t g = absl::base_internal::UnalignedLoad64(ptr + 48);
-      uint64_t h = absl::base_internal::UnalignedLoad64(ptr + 56);
-
-      current_state = Mix(a ^ kStaticRandomData[1], b ^ current_state);
-      duplicated_state0 = Mix(c ^ kStaticRandomData[2], d ^ duplicated_state0);
-
-      duplicated_state1 = Mix(e ^ kStaticRandomData[3], f ^ duplicated_state1);
-      duplicated_state2 = Mix(g ^ kStaticRandomData[4], h ^ duplicated_state2);
-
-      ptr += 64;
-      len -= 64;
-    } while (len > 64);
-
-    current_state = (current_state ^ duplicated_state0) ^
-                    (duplicated_state1 + duplicated_state2);
-  }
-
+[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
+LowLevelHashLenGt64(const void* data, size_t len, uint64_t seed) {
+  assert(len > 64);
+  const uint8_t* ptr = static_cast<const uint8_t*>(data);
+  uint64_t current_state = seed ^ kStaticRandomData[0] ^ len;
+  const uint8_t* last_32_ptr = ptr + len - 32;
+  // If we have more than 64 bytes, we're going to handle chunks of 64
+  // bytes at a time. We're going to build up four separate hash states
+  // which we will then hash together. This avoids short dependency chains.
+  uint64_t duplicated_state0 = current_state;
+  uint64_t duplicated_state1 = current_state;
+  uint64_t duplicated_state2 = current_state;
+
+  do {
+    PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
+
+    uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
+    uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
+    uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
+    uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
+    uint64_t e = absl::base_internal::UnalignedLoad64(ptr + 32);
+    uint64_t f = absl::base_internal::UnalignedLoad64(ptr + 40);
+    uint64_t g = absl::base_internal::UnalignedLoad64(ptr + 48);
+    uint64_t h = absl::base_internal::UnalignedLoad64(ptr + 56);
+
+    current_state = Mix(a ^ kStaticRandomData[1], b ^ current_state);
+    duplicated_state0 = Mix(c ^ kStaticRandomData[2], d ^ duplicated_state0);
+
+    duplicated_state1 = Mix(e ^ kStaticRandomData[3], f ^ duplicated_state1);
+    duplicated_state2 = Mix(g ^ kStaticRandomData[4], h ^ duplicated_state2);
+
+    ptr += 64;
+    len -= 64;
+  } while (len > 64);
+
+  current_state = (current_state ^ duplicated_state0) ^
+                  (duplicated_state1 + duplicated_state2);
  // We now have a data `ptr` with at most 64 bytes and the current state
  // of the hashing state machine stored in current_state.
  if (len > 32) {
@@ -96,6 +149,15 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
  return Mix32Bytes(last_32_ptr, current_state);
}

+[[maybe_unused]] uint64_t LowLevelHashLenGt32(const void* data, size_t len,
+                                              uint64_t seed) {
+  assert(len > 32);
+  if (ABSL_PREDICT_FALSE(len > 64)) {
+    return LowLevelHashLenGt64(data, len, seed);
+  }
+  return LowLevelHash33To64(static_cast<const uint8_t*>(data), len, seed);
+}
+
ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit(
    const unsigned char* data, size_t len, uint64_t state) {
  // TODO(b/417141985): expose and use CityHash32WithSeed.
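
An aside for readers less familiar with the AES-NI intrinsics used above (not part of the commit): _mm_aesenc_si128(x, k) performs one AES encryption round on x (ShiftRows, SubBytes, MixColumns) and then XORs k into the result, and _mm_aesdec_si128 is the inverse-round counterpart; passing `state` as the second argument therefore folds the seed and length back into every lane after each round. A minimal, self-contained sketch of the single-round mixing pattern; the helper name MixOneLane is chosen here purely for illustration:

    #include <cstdint>
    #include <emmintrin.h>  // _mm_add_epi64, _mm_cvtsi128_si64
    #include <smmintrin.h>  // _mm_extract_epi64 (SSE4.1)
    #include <wmmintrin.h>  // _mm_aesenc_si128 (AES-NI)

    // Add the state to the data, run one AES round with the state as the
    // round key, then fold the 128-bit result down to 64 bits, mirroring the
    // final x64 ^ y64 step in LowLevelHash33To64.
    inline uint64_t MixOneLane(__m128i data, __m128i state) {
      __m128i mixed = _mm_aesenc_si128(_mm_add_epi64(state, data), state);
      return static_cast<uint64_t>(_mm_cvtsi128_si64(mixed)) ^
             static_cast<uint64_t>(_mm_extract_epi64(mixed, 1));
    }

Compiling this sketch requires AES-NI and SSE4.1 to be enabled (e.g. -msse4.1 -maes with GCC or Clang).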