Skip to content

Commit f33b7c0

Browse files
goldvitaly authored and copybara-github committed
Special implementation for string hash with sizes in range [33, 64].
AES instructions are used when available. We load 33-64 bytes of the string into four overlapping 128-bit vectors and use two rounds of AES encrypt and decrypt to mix the bits.

```
name                                      INSTRUCTIONS/op  INSTRUCTIONS/op  vs base
BM_HASHING_Combine_contiguous_Fleet_hot   2.047 ± 0%   1.927 ± 0%   -5.86% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold  2.229 ± 0%   2.109 ± 0%   -5.38% (p=0.000 n=30)

name                                      CYCLES/op    CYCLES/op    vs base
BM_HASHING_Combine_contiguous_Fleet_hot   520.0m ± 0%  501.0m ± 1%  -3.65% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_Fleet_cold  1.754 ± 1%   1.696 ± 2%   -3.33% (p=0.003 n=30)
BM_HASHING_Combine_contiguous_0_hot       557.5m ± 0%  542.5m ± 0%  -2.69% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_0_cold      1.769 ± 2%   1.742 ± 2%   ~ (p=0.117 n=30)
BM_HASHING_Combine_contiguous_1_hot       389.0m ± 0%  371.0m ± 1%  -4.63% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_1_cold      1.450 ± 1%   1.389 ± 2%   -4.24% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_2_hot       555.0m ± 0%  547.0m ± 0%  -1.44% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_2_cold      1.526 ± 2%   1.504 ± 2%   -1.41% (p=0.024 n=30)
BM_HASHING_Combine_contiguous_3_hot       507.0m ± 0%  470.5m ± 1%  -7.20% (p=0.000 n=30)
BM_HASHING_Combine_contiguous_3_cold      1.187 ± 1%   1.139 ± 1%   -4.05% (p=0.000 n=30)
geomean                                   873.5m       843.5m       -3.43%
```

PiperOrigin-RevId: 836234239
Change-Id: I4a2344d77becefd3d5e788790f913fbe3611e8b0
1 parent 69e7e0a commit f33b7c0

File tree

2 files changed

+134
-37
lines changed

2 files changed

+134
-37
lines changed

absl/hash/internal/hash.cc

Lines changed: 99 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,20 @@
2626
#include "absl/base/prefetch.h"
2727
#include "absl/hash/internal/city.h"
2828

29+
30+
#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
31+
#error ABSL_AES_INTERNAL_HAVE_X86_SIMD cannot be directly set
32+
#elif defined(__SSE4_2__) && defined(__AES__)
33+
#define ABSL_AES_INTERNAL_HAVE_X86_SIMD
34+
#endif
35+
36+
37+
#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
38+
#include <smmintrin.h>
39+
#include <wmmintrin.h>
40+
#include <xmmintrin.h>
41+
#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD
42+
2943
namespace absl {
3044
ABSL_NAMESPACE_BEGIN
3145
namespace hash_internal {
@@ -43,47 +57,86 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
4357
return cs0 ^ cs1;
4458
}
4559

46-
[[maybe_unused]] uint64_t LowLevelHashLenGt32(const void* data, size_t len,
47-
uint64_t seed) {
60+
#ifdef ABSL_AES_INTERNAL_HAVE_X86_SIMD
61+
// AES-accelerated hash for inputs whose length is in (32, 64] bytes; both
// bounds are checked with assert. This variant is compiled only when
// ABSL_AES_INTERNAL_HAVE_X86_SIMD is defined.
//
//   ptr  - first byte of the input; at least `len` bytes are read from it.
//   len  - input length in bytes, 33..64 inclusive.
//   seed - 64-bit seed, placed in the upper half of the initial 128-bit state.
//
// Returns the XOR of the low and high 64-bit halves of the mixed 128-bit
// value.
//
// NOTE(review): this text is a rendered diff. Lines made only of digits and a
// trailing '+' or '-' are diff gutter markers, and the statement following the
// `49-` marker appears to be a removed line of the old implementation, not
// part of this function -- confirm against the real file.
uint64_t LowLevelHash33To64(const uint8_t* ptr, size_t len, uint64_t seed) {
4862
assert(len > 32);
49-
const uint8_t* ptr = static_cast<const uint8_t*>(data);
63+
assert(len <= 64);
64+
// Initial state: `seed` in the high 64 bits, `len` in the low 64 bits.
__m128i state =
65+
_mm_set_epi64x(static_cast<int64_t>(seed), static_cast<int64_t>(len));
66+
// `a` and `b` are unaligned loads of the first 32 bytes of the input.
auto a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
67+
auto b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr + 16));
68+
// `c` and `d` are unaligned loads of the last 32 bytes. Because len <= 64,
// they overlap the first 32 bytes whenever len < 64.
auto* last32_ptr = ptr + len - 32;
69+
auto c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(last32_ptr));
70+
auto d = _mm_loadu_si128(reinterpret_cast<const __m128i*>(last32_ptr + 16));
71+
72+
// Bits of the second argument to _mm_aesdec_si128/_mm_aesenc_si128 are
73+
// XORed with the state argument after encryption.
74+
// We use each value as the first argument to shuffle all the bits around.
75+
// We do not add any salt to the state or loaded data, instead we vary
76+
// instructions used to mix bits _mm_aesdec_si128/_mm_aesenc_si128 and
77+
// _mm_add_epi64/_mm_sub_epi64.
78+
// _mm_add_epi64/_mm_sub_epi64 are combined to one instruction with data
79+
// loading like `vpaddq xmm1, xmm0, xmmword ptr [rdi]`.
80+
auto na = _mm_aesdec_si128(_mm_add_epi64(state, a), state);
81+
auto nb = _mm_aesdec_si128(_mm_sub_epi64(state, b), state);
82+
auto nc = _mm_aesenc_si128(_mm_add_epi64(state, c), state);
83+
auto nd = _mm_aesenc_si128(_mm_sub_epi64(state, d), state);
84+
85+
// We perform another round of encryption to mix bits between two halves of
86+
// the input.
87+
auto res128 = _mm_add_epi64(_mm_aesenc_si128(_mm_add_epi64(na, nc), nd),
88+
_mm_aesdec_si128(_mm_sub_epi64(nb, nd), na));
89+
// Fold the 128-bit result down to 64 bits: low half XOR high half.
auto x64 = static_cast<uint64_t>(_mm_cvtsi128_si64(res128));
90+
auto y64 = static_cast<uint64_t>(_mm_extract_epi64(res128, 1));
91+
return x64 ^ y64;
92+
}
93+
#else
94+
// Portable variant of the hash for inputs whose length is in (32, 64] bytes;
// both bounds are checked with assert. It sits in the `#else` branch, so it is
// compiled when ABSL_AES_INTERNAL_HAVE_X86_SIMD is not defined.
//
//   ptr  - first byte of the input.
//   len  - input length in bytes, 33..64 inclusive.
//   seed - 64-bit seed folded into the initial state.
//
// Returns the state after mixing the first 32 bytes and then the last 32
// bytes of the input through Mix32Bytes.
uint64_t LowLevelHash33To64(const uint8_t* ptr, size_t len, uint64_t seed) {
95+
assert(len > 32);
96+
assert(len <= 64);
5097
// The initial state combines the seed, a static constant and the length.
uint64_t current_state = seed ^ kStaticRandomData[0] ^ len;
5198
// Start of the final 32 bytes; overlaps the first 32 bytes when len < 64.
const uint8_t* last_32_ptr = ptr + len - 32;
99+
return Mix32Bytes(last_32_ptr, Mix32Bytes(ptr, current_state));
100+
}
101+
#endif // ABSL_AES_INTERNAL_HAVE_X86_SIMD
52102

53-
if (len > 64) {
54-
// If we have more than 64 bytes, we're going to handle chunks of 64
55-
// bytes at a time. We're going to build up four separate hash states
56-
// which we will then hash together. This avoids short dependency chains.
57-
uint64_t duplicated_state0 = current_state;
58-
uint64_t duplicated_state1 = current_state;
59-
uint64_t duplicated_state2 = current_state;
60-
61-
do {
62-
PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
63-
64-
uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
65-
uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
66-
uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
67-
uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
68-
uint64_t e = absl::base_internal::UnalignedLoad64(ptr + 32);
69-
uint64_t f = absl::base_internal::UnalignedLoad64(ptr + 40);
70-
uint64_t g = absl::base_internal::UnalignedLoad64(ptr + 48);
71-
uint64_t h = absl::base_internal::UnalignedLoad64(ptr + 56);
72-
73-
current_state = Mix(a ^ kStaticRandomData[1], b ^ current_state);
74-
duplicated_state0 = Mix(c ^ kStaticRandomData[2], d ^ duplicated_state0);
75-
76-
duplicated_state1 = Mix(e ^ kStaticRandomData[3], f ^ duplicated_state1);
77-
duplicated_state2 = Mix(g ^ kStaticRandomData[4], h ^ duplicated_state2);
78-
79-
ptr += 64;
80-
len -= 64;
81-
} while (len > 64);
82-
83-
current_state = (current_state ^ duplicated_state0) ^
84-
(duplicated_state1 + duplicated_state2);
85-
}
86-
103+
[[maybe_unused]] ABSL_ATTRIBUTE_NOINLINE uint64_t
104+
LowLevelHashLenGt64(const void* data, size_t len, uint64_t seed) {
105+
assert(len > 64);
106+
const uint8_t* ptr = static_cast<const uint8_t*>(data);
107+
uint64_t current_state = seed ^ kStaticRandomData[0] ^ len;
108+
const uint8_t* last_32_ptr = ptr + len - 32;
109+
// If we have more than 64 bytes, we're going to handle chunks of 64
110+
// bytes at a time. We're going to build up four separate hash states
111+
// which we will then hash together. This avoids short dependency chains.
112+
uint64_t duplicated_state0 = current_state;
113+
uint64_t duplicated_state1 = current_state;
114+
uint64_t duplicated_state2 = current_state;
115+
116+
do {
117+
PrefetchToLocalCache(ptr + 5 * ABSL_CACHELINE_SIZE);
118+
119+
uint64_t a = absl::base_internal::UnalignedLoad64(ptr);
120+
uint64_t b = absl::base_internal::UnalignedLoad64(ptr + 8);
121+
uint64_t c = absl::base_internal::UnalignedLoad64(ptr + 16);
122+
uint64_t d = absl::base_internal::UnalignedLoad64(ptr + 24);
123+
uint64_t e = absl::base_internal::UnalignedLoad64(ptr + 32);
124+
uint64_t f = absl::base_internal::UnalignedLoad64(ptr + 40);
125+
uint64_t g = absl::base_internal::UnalignedLoad64(ptr + 48);
126+
uint64_t h = absl::base_internal::UnalignedLoad64(ptr + 56);
127+
128+
current_state = Mix(a ^ kStaticRandomData[1], b ^ current_state);
129+
duplicated_state0 = Mix(c ^ kStaticRandomData[2], d ^ duplicated_state0);
130+
131+
duplicated_state1 = Mix(e ^ kStaticRandomData[3], f ^ duplicated_state1);
132+
duplicated_state2 = Mix(g ^ kStaticRandomData[4], h ^ duplicated_state2);
133+
134+
ptr += 64;
135+
len -= 64;
136+
} while (len > 64);
137+
138+
current_state = (current_state ^ duplicated_state0) ^
139+
(duplicated_state1 + duplicated_state2);
87140
// We now have a data `ptr` with at most 64 bytes and the current state
88141
// of the hashing state machine stored in current_state.
89142
if (len > 32) {
@@ -96,6 +149,15 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) {
96149
return Mix32Bytes(last_32_ptr, current_state);
97150
}
98151

152+
// Entry point for hashing inputs longer than 32 bytes (checked with assert).
//
//   data - start of the input buffer.
//   len  - input length in bytes, must be > 32.
//   seed - 64-bit seed forwarded unchanged to the selected implementation.
//
// Inputs longer than 64 bytes go to LowLevelHashLenGt64; that branch is
// annotated as the unlikely one. All other inputs (33..64 bytes) go to
// LowLevelHash33To64.
[[maybe_unused]] uint64_t LowLevelHashLenGt32(const void* data, size_t len,
153+
uint64_t seed) {
154+
assert(len > 32);
155+
if (ABSL_PREDICT_FALSE(len > 64)) {
156+
return LowLevelHashLenGt64(data, len, seed);
157+
}
158+
return LowLevelHash33To64(static_cast<const uint8_t*>(data), len, seed);
159+
}
160+
99161
ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit(
100162
const unsigned char* data, size_t len, uint64_t state) {
101163
// TODO(b/417141985): expose and use CityHash32WithSeed.

absl/hash/internal/low_level_hash_test.cc

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -364,6 +364,41 @@ TEST(LowLevelHashTest, VerifyGolden) {
364364
GTEST_SKIP()
365365
<< "We only maintain golden data for little endian 64 bit systems with "
366366
"128 bit intristics.";
367+
#elif defined(__SSE4_2__) && defined(__AES__)
368+
constexpr uint64_t kGolden[kNumGoldenOutputs] = {
369+
0xd6bdb2c9ba5e55f2, 0xffd3e23d4115a8ae, 0x2c3218ef486127de,
370+
0x554fa7f3a262b886, 0x06304cbf82e312d3, 0x490b3fb5af80622c,
371+
0x7398a90b8cc59c5d, 0x65fb3168b98030ab, 0xd4564363c53617bb,
372+
0x0545c26351925fe7, 0xc30700723b634bf4, 0xfb23a140a76dbe94,
373+
0x2fa1467fe218a47c, 0x92e05ec3a7b966eb, 0x6112b56e5624dd50,
374+
0x8760801365f9d722, 0x41f7187b61db0e5e, 0x7fe9188a1f5f50ad,
375+
0x25800bd4c2002ef1, 0x91fecd33a78ef0aa, 0x93986ad71e983613,
376+
0xe4c78173c7ea537b, 0x0bbdc2bcabdb50b1, 0xd9aa134df2d87623,
377+
0x6c4907c9477a9409, 0xc3e418a5dbda52e5, 0x4d24f3e9d0dda93a,
378+
0xcdb565a363dbe45f, 0xa95f228c8ee57478, 0x6b8f00bab5130227,
379+
0x2d05a0f44818b67a, 0xa64b55b071afbbea, 0xa205bfe6c724ce4d,
380+
0x69dd26ca8ac21744, 0xef80e2ff2f6a9bc0, 0xde266c0baa202c20,
381+
0xfa3463080ac74c50, 0x379d968a40125c2b, 0x4cbbd0a7b3c7d648,
382+
0xc92afd93f4c665d2, 0x6e28f5adb7ae38dc, 0x7c689c9c237be35e,
383+
0xaea41b29bd9d0f73, 0x832cef631d77e59f, 0x70cac8e87bc37dd3,
384+
0x8e8c98bbde68e764, 0xd6117aeb3ddedded, 0xd796ab808e766240,
385+
0x8953d0ea1a7d9814, 0xa212eba4281b391c, 0x21a555a8939ce597,
386+
0x809d31660f6d81a8, 0x2356524b20ab400f, 0x5bc611e1e49d0478,
387+
0xba9c065e2f385ce2, 0xb0a0fd12f4e83899, 0x14d076a35b1ff2ca,
388+
0x8acd0bb8cf9a93c0, 0xe62e8ec094039ee4, 0x38a536a7072bdc61,
389+
0xca256297602524f8, 0xfc62ebfb3530caeb, 0x8d8b0c05520569f6,
390+
0xbbaca65cf154c59d, 0x3739b5ada7e338d3, 0xdb9ea31f47365340,
391+
0x410b5c9c1da56755, 0x7e0abc03dbd10283, 0x136f87be70ed442e,
392+
0x6b727d4feddbe1e9, 0x074ebb21183b01df, 0x3fe92185b1985484,
393+
0xc5d8efd3c68305ca, 0xd9bada21b17e272e, 0x64d73133e1360f83,
394+
0xeb8563aa993e21f9, 0xe5e8da50cceab28f, 0x7a6f92eb3223d2f3,
395+
0xbdaf98370ea9b31b, 0x1682a84457f077bc, 0x4abd2d33b6e3be37,
396+
0xb35bc81a7c9d4c04, 0x3e5bde3fb7cfe63d, 0xff3abe6e2ffec974,
397+
0xb8116dd26cf6feec, 0x7a77a6e4ed0cf081, 0xb71eec2d5a184316,
398+
0x6fa932f77b4da817, 0x795f79b33909b2c4, 0x1b8755ef6b5eb34e,
399+
0x2255b72d7d6b2d79, 0xf2bdafafa90bd50a, 0x442a578f02cb1fc8,
400+
0xc25aefe55ecf83db, 0x3114c056f9c5a676,
401+
};
367402
#else
368403
constexpr uint64_t kGolden[kNumGoldenOutputs] = {
369404
0x669da02f8d009e0f, 0xceb19bf2255445cd, 0x0e746992d6d43a7c,

0 commit comments

Comments
 (0)