Skip to content

Commit 61e47a4

Browse files
copybara-github
authored and committed
Optimize crc32 V128_From2x64 on Arm
This removes redundant vector-vector moves and results in Extend being up to 3% faster. PiperOrigin-RevId: 621948170 Change-Id: Id82816aa6e294d34140ff591103cb20feac79d9a
1 parent 1ec4a27 commit 61e47a4

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

absl/crc/internal/crc32_x86_arm_combined_simd.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,11 @@ V128 V128_Xor(const V128 l, const V128 r);
102102
// Produces an AND operation of |l| and |r|.
103103
V128 V128_And(const V128 l, const V128 r);
104104

105-
// Sets two 64 bit integers to one 128 bit vector. The order is reverse.
105+
// Sets the lower half of a 128 bit register to the given 64-bit value and
106+
// zeroes the upper half.
106107
// dst[63:0] := |r|
107-
// dst[127:64] := |l|
108-
V128 V128_From2x64(const uint64_t l, const uint64_t r);
108+
// dst[127:64] := |0|
109+
V128 V128_From64WithZeroFill(const uint64_t r);
109110

110111
// Shift |l| right by |imm| bytes while shifting in zeros.
111112
template <int imm>
@@ -171,8 +172,8 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
171172

172173
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
173174

174-
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
175-
return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
175+
inline V128 V128_From64WithZeroFill(const uint64_t r) {
176+
return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
176177
}
177178

178179
template <int imm>
@@ -262,10 +263,12 @@ inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
262263

263264
inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
264265

265-
inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
266-
return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
266+
inline V128 V128_From64WithZeroFill(const uint64_t r){
267+
constexpr uint64x2_t kZero = {0, 0};
268+
return vsetq_lane_u64(r, kZero, 0);
267269
}
268270

271+
269272
template <int imm>
270273
inline V128 V128_ShiftRight(const V128 l) {
271274
return vreinterpretq_u64_s8(

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ constexpr size_t kMediumCutoff = 2048;
101101
namespace {
102102

103103
uint32_t multiply(uint32_t a, uint32_t b) {
104-
V128 shifts = V128_From2x64(0, 1);
105-
V128 power = V128_From2x64(0, a);
106-
V128 crc = V128_From2x64(0, b);
104+
V128 shifts = V128_From64WithZeroFill(1);
105+
V128 power = V128_From64WithZeroFill(a);
106+
V128 crc = V128_From64WithZeroFill(b);
107107
V128 res = V128_PMulLow(power, crc);
108108

109109
// Combine crc values
@@ -444,11 +444,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
444444

445445
V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);
446446

447-
V128 tmp = V128_From2x64(0, l64);
447+
V128 tmp = V128_From64WithZeroFill(l64);
448448

449449
V128 res1 = V128_PMulLow(tmp, magic);
450450

451-
tmp = V128_From2x64(0, l641);
451+
tmp = V128_From64WithZeroFill(l641);
452452

453453
V128 res2 = V128_PMul10(tmp, magic);
454454
V128 x = V128_Xor(res1, res2);

0 commit comments

Comments
 (0)