From 901445f00aac85eea06a48ffcfb32e54f46cb6c3 Mon Sep 17 00:00:00 2001 From: Bhavana Kilambi Date: Mon, 17 Nov 2025 15:00:51 +0000 Subject: [PATCH 1/2] Prepare base for SVE implementation in libsimdsort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separated the libsimdsort implementation for aarch64 and x86 in two different folders under src/java.base/linux/native/libsimdsort which might help in better future maintenance of AArch64 and x86 implementations. New layout - src/java.base/linux/native/libsimdsort/aarch64/… src/java.base/linux/native/libsimdsort/x86/… Moved the following files into the libsimdsort/x86 folder - src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp src/java.base/linux/native/libsimdsort/x86/avx2-emu-funcs.hpp src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp src/java.base/linux/native/libsimdsort/x86/avx512-32bit-qsort.hpp src/java.base/linux/native/libsimdsort/x86/avx512-64bit-qsort.hpp src/java.base/linux/native/libsimdsort/x86/avx512-linux-qsort.cpp src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp src/java.base/linux/native/libsimdsort/x86/xss-common-includes.h src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h src/java.base/linux/native/libsimdsort/x86/xss-network-qsort.hpp src/java.base/linux/native/libsimdsort/x86/xss-optimal-networks.hpp src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp Copied the following files from libsimdsort/x86 to libsimdsort/aarch64 folder - x86/xss-pivot-selection.hpp -> aarch64/pivot-selection.hpp x86/simdsort-support.hpp -> aarch64/simdsort-support.hpp x86/xss-common-qsort.h -> aarch64/sve-common-qsort.hpp x86/avx2-linux-qsort.cpp -> aarch64/sve-linux-qsort.cpp x86/avx2-32bit-qsort.hpp -> aarch64/sve-qsort.hpp --- .../pivot-selection.hpp} | 0 .../{ => aarch64}/simdsort-support.hpp | 0 .../sve-common-qsort.hpp} | 0 .../sve-linux-qsort.cpp} | 0 .../sve-qsort.hpp} | 0 .../libsimdsort/x86/avx2-32bit-qsort.hpp | 367 ++++++++++++ .../libsimdsort/{ => x86}/avx2-emu-funcs.hpp | 0 .../libsimdsort/x86/avx2-linux-qsort.cpp | 66 +++ .../{ => x86}/avx512-32bit-qsort.hpp | 0 .../{ => x86}/avx512-64bit-qsort.hpp | 0 .../{ => x86}/avx512-linux-qsort.cpp | 0 .../libsimdsort/x86/simdsort-support.hpp | 39 ++ .../{ => x86}/xss-common-includes.h | 0 .../native/libsimdsort/x86/xss-common-qsort.h | 528 ++++++++++++++++++ .../{ => x86}/xss-network-qsort.hpp | 0 .../{ => x86}/xss-optimal-networks.hpp | 0 .../libsimdsort/x86/xss-pivot-selection.hpp | 88 +++ 17 files changed, 1088 insertions(+) rename src/java.base/linux/native/libsimdsort/{xss-pivot-selection.hpp => aarch64/pivot-selection.hpp} (100%) rename src/java.base/linux/native/libsimdsort/{ => aarch64}/simdsort-support.hpp (100%) rename src/java.base/linux/native/libsimdsort/{xss-common-qsort.h => aarch64/sve-common-qsort.hpp} (100%) rename src/java.base/linux/native/libsimdsort/{avx2-linux-qsort.cpp => aarch64/sve-linux-qsort.cpp} (100%) rename src/java.base/linux/native/libsimdsort/{avx2-32bit-qsort.hpp => aarch64/sve-qsort.hpp} (100%) create mode 100644 src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp rename src/java.base/linux/native/libsimdsort/{ => x86}/avx2-emu-funcs.hpp (100%) create mode 100644 src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp rename src/java.base/linux/native/libsimdsort/{ => x86}/avx512-32bit-qsort.hpp (100%) rename src/java.base/linux/native/libsimdsort/{ => x86}/avx512-64bit-qsort.hpp (100%) rename 
src/java.base/linux/native/libsimdsort/{ => x86}/avx512-linux-qsort.cpp (100%) create mode 100644 src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp rename src/java.base/linux/native/libsimdsort/{ => x86}/xss-common-includes.h (100%) create mode 100644 src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h rename src/java.base/linux/native/libsimdsort/{ => x86}/xss-network-qsort.hpp (100%) rename src/java.base/linux/native/libsimdsort/{ => x86}/xss-optimal-networks.hpp (100%) create mode 100644 src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-pivot-selection.hpp b/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-pivot-selection.hpp rename to src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp diff --git a/src/java.base/linux/native/libsimdsort/simdsort-support.hpp b/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/simdsort-support.hpp rename to src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-common-qsort.h b/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-common-qsort.h rename to src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx2-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-linux-qsort.cpp rename to src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp diff --git a/src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-32bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp new file mode 100644 index 0000000000000..9310b0098d808 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/x86/avx2-32bit-qsort.hpp @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + +#ifndef AVX2_QSORT_32BIT +#define AVX2_QSORT_32BIT + +#include "avx2-emu-funcs.hpp" +#include "xss-common-qsort.h" + +/* + * Constants used in sorting 8 elements in a ymm registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ + +// ymm 7, 6, 5, 4, 3, 2, 1, 0 +#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3 +#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7 +#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2 +#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4 + +/* + * Assumes ymm is random and performs a full sorting network defined in + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg + */ +template +X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm) { + const typename vtype::opmask_t oxAA = _mm256_set_epi32( + 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); + const typename vtype::opmask_t oxCC = _mm256_set_epi32( + 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0); + const typename vtype::opmask_t oxF0 = _mm256_set_epi32( + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0); + + const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2); + ymm = cmp_merge( + ymm, vtype::template shuffle(ymm), oxAA); + ymm = cmp_merge( + ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm), oxCC); + ymm = cmp_merge( + ymm, vtype::template shuffle(ymm), oxAA); + ymm = cmp_merge(ymm, vtype::permutexvar(rev_index, ymm), oxF0); + ymm = cmp_merge( + ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm), oxCC); + ymm = cmp_merge( + ymm, vtype::template shuffle(ymm), oxAA); + return ymm; +} + +struct avx2_32bit_swizzle_ops; + +template <> +struct avx2_vector { + using type_t = int32_t; + using reg_t = __m256i; + using ymmi_t = __m256i; + using opmask_t = __m256i; + static const uint8_t numlanes = 8; +#ifdef XSS_MINIMAL_NETWORK_SORT + static constexpr int network_sort_threshold = numlanes; +#else + static constexpr int network_sort_threshold = 256; +#endif + static constexpr int partition_unroll_factor = 4; + + using swizzle_ops = avx2_32bit_swizzle_ops; + + static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; } + static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; } + static reg_t zmm_max() { + return _mm256_set1_epi32(type_max()); + } // TODO: this should broadcast bits as is? 
+ static opmask_t get_partial_loadmask(uint64_t num_to_read) { + auto mask = ((0x1ull << num_to_read) - 0x1ull); + return convert_int_to_avx2_mask(mask); + } + static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + } + static opmask_t kxor_opmask(opmask_t x, opmask_t y) { + return _mm256_xor_si256(x, y); + } + static opmask_t ge(reg_t x, reg_t y) { + opmask_t equal = eq(x, y); + opmask_t greater = _mm256_cmpgt_epi32(x, y); + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(equal), + _mm256_castsi256_ps(greater))); + } + static opmask_t gt(reg_t x, reg_t y) { return _mm256_cmpgt_epi32(x, y); } + static opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); } + template + static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, + void const *base) { + return _mm256_mask_i32gather_epi32(src, base, index, mask, scale); + } + template + static reg_t i64gather(__m256i index, void const *base) { + return _mm256_i32gather_epi32((int const *)base, index, scale); + } + static reg_t loadu(void const *mem) { + return _mm256_loadu_si256((reg_t const *)mem); + } + static reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { + return avx2_emu_mask_compressstoreu32(mem, mask, x); + } + static reg_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm256_maskload_epi32((const int *)mem, mask); + } + static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { + reg_t dst = _mm256_maskload_epi32((type_t *)mem, mask); + return mask_mov(x, mask, dst); + } + static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x), + _mm256_castsi256_ps(y), + _mm256_castsi256_ps(mask))); + } + static void mask_storeu(void *mem, opmask_t mask, reg_t x) { + return _mm256_maskstore_epi32((type_t *)mem, mask, x); + } + static reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); } + static reg_t permutexvar(__m256i idx, reg_t ymm) { + return _mm256_permutevar8x32_epi32(ymm, idx); + // return avx2_emu_permutexvar_epi32(idx, ymm); + } + static reg_t permutevar(reg_t ymm, __m256i idx) { + return _mm256_permutevar8x32_epi32(ymm, idx); + } + static reg_t reverse(reg_t ymm) { + const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); + return permutexvar(rev_index, ymm); + } + static type_t reducemax(reg_t v) { + return avx2_emu_reduce_max32(v); + } + static type_t reducemin(reg_t v) { + return avx2_emu_reduce_min32(v); + } + static reg_t set1(type_t v) { return _mm256_set1_epi32(v); } + template + static reg_t shuffle(reg_t ymm) { + return _mm256_shuffle_epi32(ymm, mask); + } + static void storeu(void *mem, reg_t x) { + _mm256_storeu_si256((__m256i *)mem, x); + } + static reg_t sort_vec(reg_t x) { + return sort_ymm_32bit>(x); + } + static reg_t cast_from(__m256i v) { return v; } + static __m256i cast_to(reg_t v) { return v; } + static int double_compressstore(type_t *left_addr, type_t *right_addr, + opmask_t k, reg_t reg) { + return avx2_double_compressstore32(left_addr, right_addr, k, + reg); + } +}; + +template <> +struct avx2_vector { + using type_t = float; + using reg_t = __m256; + using ymmi_t = __m256i; + using opmask_t = __m256i; + static const uint8_t numlanes = 8; +#ifdef XSS_MINIMAL_NETWORK_SORT + static constexpr int network_sort_threshold = numlanes; +#else + static constexpr int network_sort_threshold = 256; +#endif + static 
constexpr int partition_unroll_factor = 4; + + using swizzle_ops = avx2_32bit_swizzle_ops; + + static type_t type_max() { return X86_SIMD_SORT_INFINITYF; } + static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; } + static reg_t zmm_max() { return _mm256_set1_ps(type_max()); } + + static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8) { + return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + } + + static reg_t maskz_loadu(opmask_t mask, void const *mem) { + return _mm256_maskload_ps((const float *)mem, mask); + } + static opmask_t ge(reg_t x, reg_t y) { + return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); + } + static opmask_t gt(reg_t x, reg_t y) { + return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); + } + static opmask_t eq(reg_t x, reg_t y) { + return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); + } + static opmask_t get_partial_loadmask(uint64_t num_to_read) { + auto mask = ((0x1ull << num_to_read) - 0x1ull); + return convert_int_to_avx2_mask(mask); + } + static int32_t convert_mask_to_int(opmask_t mask) { + return convert_avx2_mask_to_int(mask); + } + template + static opmask_t fpclass(reg_t x) { + if constexpr (type == (0x01 | 0x80)) { + return _mm256_castps_si256(_mm256_cmp_ps(x, x, _CMP_UNORD_Q)); + } else { + static_assert(type == (0x01 | 0x80), "should not reach here"); + } + } + template + static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, + void const *base) { + return _mm256_mask_i32gather_ps(src, base, index, + _mm256_castsi256_ps(mask), scale); + ; + } + template + static reg_t i64gather(__m256i index, void const *base) { + return _mm256_i32gather_ps((float *)base, index, scale); + } + static reg_t loadu(void const *mem) { + return _mm256_loadu_ps((float const *)mem); + } + static reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); } + static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { + return avx2_emu_mask_compressstoreu32(mem, mask, x); + } + static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { + reg_t dst = _mm256_maskload_ps((type_t *)mem, mask); + return mask_mov(x, mask, dst); + } + static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { + return _mm256_blendv_ps(x, y, _mm256_castsi256_ps(mask)); + } + static void mask_storeu(void *mem, opmask_t mask, reg_t x) { + return _mm256_maskstore_ps((type_t *)mem, mask, x); + } + static reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); } + static reg_t permutexvar(__m256i idx, reg_t ymm) { + return _mm256_permutevar8x32_ps(ymm, idx); + } + static reg_t permutevar(reg_t ymm, __m256i idx) { + return _mm256_permutevar8x32_ps(ymm, idx); + } + static reg_t reverse(reg_t ymm) { + const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); + return permutexvar(rev_index, ymm); + } + static type_t reducemax(reg_t v) { + return avx2_emu_reduce_max32(v); + } + static type_t reducemin(reg_t v) { + return avx2_emu_reduce_min32(v); + } + static reg_t set1(type_t v) { return _mm256_set1_ps(v); } + template + static reg_t shuffle(reg_t ymm) { + return _mm256_castsi256_ps( + _mm256_shuffle_epi32(_mm256_castps_si256(ymm), mask)); + } + static void storeu(void *mem, reg_t x) { + _mm256_storeu_ps((float *)mem, x); + } + static reg_t sort_vec(reg_t x) { + return sort_ymm_32bit>(x); + } + static reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); } + static __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); } + static int double_compressstore(type_t *left_addr, type_t *right_addr, + opmask_t k, 
reg_t reg) { + return avx2_double_compressstore32(left_addr, right_addr, k, + reg); + } +}; + +struct avx2_32bit_swizzle_ops { + template + X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n( + typename vtype::reg_t reg) { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b10110001); + v = _mm256_castps_si256(vf); + } else if constexpr (scale == 4) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b01001110); + v = _mm256_castps_si256(vf); + } else if constexpr (scale == 8) { + v = _mm256_permute2x128_si256(v, v, 0b00000001); + } else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n( + typename vtype::reg_t reg) { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { + return swap_n(reg); + } else if constexpr (scale == 4) { + constexpr uint64_t mask = 0b00011011; + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, mask); + v = _mm256_castps_si256(vf); + } else if constexpr (scale == 8) { + return vtype::reverse(reg); + } else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n( + typename vtype::reg_t reg, typename vtype::reg_t other) { + __m256i v1 = vtype::cast_to(reg); + __m256i v2 = vtype::cast_to(other); + + if constexpr (scale == 2) { + v1 = _mm256_blend_epi32(v1, v2, 0b01010101); + } else if constexpr (scale == 4) { + v1 = _mm256_blend_epi32(v1, v2, 0b00110011); + } else if constexpr (scale == 8) { + v1 = _mm256_blend_epi32(v1, v2, 0b00001111); + } else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v1); + } +}; + +#endif // AVX2_QSORT_32BIT diff --git a/src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp b/src/java.base/linux/native/libsimdsort/x86/avx2-emu-funcs.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx2-emu-funcs.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx2-emu-funcs.hpp diff --git a/src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp new file mode 100644 index 0000000000000..628d65077c701 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/x86/avx2-linux-qsort.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "simdsort-support.hpp" +#ifdef __SIMDSORT_SUPPORTED_LINUX + +#pragma GCC target("avx2") +#include "avx2-32bit-qsort.hpp" +#include "classfile_constants.h" + + +#define DLL_PUBLIC __attribute__((visibility("default"))) +#define INSERTION_SORT_THRESHOLD_32BIT 16 + +extern "C" { + + DLL_PUBLIC void avx2_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { + switch(elem_type) { + case JVM_T_INT: + avx2_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + break; + case JVM_T_FLOAT: + avx2_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + break; + default: + assert(false, "Unexpected type"); + } + } + + DLL_PUBLIC void avx2_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + switch(elem_type) { + case JVM_T_INT: + avx2_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + case JVM_T_FLOAT: + avx2_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + break; + default: + assert(false, "Unexpected type"); + } + } + +} + +#endif \ No newline at end of file diff --git a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/avx512-32bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-32bit-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/avx512-64bit-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-64bit-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/x86/avx512-linux-qsort.cpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp rename to src/java.base/linux/native/libsimdsort/x86/avx512-linux-qsort.cpp diff --git a/src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp b/src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp new file mode 100644 index 0000000000000..f6946fdccec28 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/x86/simdsort-support.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SIMDSORT_SUPPORT_HPP +#define SIMDSORT_SUPPORT_HPP +#include +#include + +#undef assert +#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }} + + +// GCC >= 9.1 is needed to build AVX2 portions of libsimdsort using C++17 features +#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 9) || ((__GNUC__ == 9) && (__GNUC_MINOR__ >= 1)))) +#define __SIMDSORT_SUPPORTED_LINUX +#endif + +#endif //SIMDSORT_SUPPORT_HPP \ No newline at end of file diff --git a/src/java.base/linux/native/libsimdsort/xss-common-includes.h b/src/java.base/linux/native/libsimdsort/x86/xss-common-includes.h similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-common-includes.h rename to src/java.base/linux/native/libsimdsort/x86/xss-common-includes.h diff --git a/src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h b/src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h new file mode 100644 index 0000000000000..95fe8738d35e2 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/x86/xss-common-qsort.h @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + +#ifndef XSS_COMMON_QSORT +#define XSS_COMMON_QSORT + +/* + * Quicksort using AVX-512. The ideas and code are based on these two research + * papers [1] and [2]. On a high level, the idea is to vectorize quicksort + * partitioning using AVX-512 compressstore instructions. If the array size is + * < 128, then use Bitonic sorting network implemented on 512-bit registers. + * The precise network definitions depend on the dtype and are defined in + * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and + * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting + * network. The core implementations of the vectorized qsort functions + * avx512_qsort(T*, arrsize_t) are modified versions of avx2 quicksort + * presented in the paper [2] and source code associated with that paper [3]. 
+ * + * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types + * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ + * + * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel + * Skylake https://arxiv.org/pdf/1704.08579.pdf + * + * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: + * MIT + * + * [4] https://mitp-content-server.mit.edu/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 + * + */ + +#include "xss-common-includes.h" +#include "xss-pivot-selection.hpp" +#include "xss-network-qsort.hpp" + + +template +bool is_a_nan(T elem) { + return std::isnan(elem); +} + +template +X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) { + // median of 8 equally spaced elements + int64_t NUM_ELEMENTS = 8; + int64_t MID = NUM_ELEMENTS / 2; + int64_t size = (right - left) / NUM_ELEMENTS; + T temp[NUM_ELEMENTS]; + for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)]; + std::sort(temp, temp + NUM_ELEMENTS); + return temp[MID]; +} + +template +bool comparison_func_ge(const T &a, const T &b) { + return a < b; +} + +template +bool comparison_func_gt(const T &a, const T &b) { + return a <= b; +} + +/* + * COEX == Compare and Exchange two registers by swapping min and max values + */ +template +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) { + mm_t temp = a; + a = vtype::min(a, b); + b = vtype::max(temp, b); +} + +template +X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) { + reg_t min = vtype::min(in2, in1); + reg_t max = vtype::max(in2, in1); + return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max +} + +template +int avx512_double_compressstore(type_t *left_addr, type_t *right_addr, + typename vtype::opmask_t k, reg_t reg) { + int amount_ge_pivot = _mm_popcnt_u32((int)k); + + vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg); + vtype::mask_compressstoreu(right_addr + vtype::numlanes - amount_ge_pivot, + k, reg); + + return amount_ge_pivot; +} + +// Generic function dispatches to AVX2 or AVX512 code +template +X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t &smallest_vec, + reg_t &biggest_vec, bool use_gt) { + //typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); + typename vtype::opmask_t mask; + if (use_gt) mask = vtype::gt(curr_vec, pivot_vec); + else mask = vtype::ge(curr_vec, pivot_vec); + + int amount_ge_pivot = + vtype::double_compressstore(l_store, r_store, mask, curr_vec); + + smallest_vec = vtype::min(curr_vec, smallest_vec); + biggest_vec = vtype::max(curr_vec, biggest_vec); + + return amount_ge_pivot; +} + +/* + * Parition an array based on the pivot and returns the index of the + * first element that is greater than or equal to the pivot. + */ +template +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left, + arrsize_t right, type_t pivot, + type_t *smallest, + type_t *biggest, + bool use_gt) { + auto comparison_func = use_gt ? 
comparison_func_gt : comparison_func_ge; + /* make array length divisible by vtype::numlanes , shortening the array */ + for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + if (left == right) + return left; /* less than vtype::numlanes elements in the array */ + + using reg_t = typename vtype::reg_t; + reg_t pivot_vec = vtype::set1(pivot); + reg_t min_vec = vtype::set1(*smallest); + reg_t max_vec = vtype::set1(*biggest); + + if (right - left == vtype::numlanes) { + reg_t vec = vtype::loadu(arr + left); + arrsize_t unpartitioned = right - left - vtype::numlanes; + arrsize_t l_store = left; + + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + vec, pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; + } + + // first and last vtype::numlanes values are partitioned at the end + reg_t vec_left = vtype::loadu(arr + left); + reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); + // store points of the vectors + arrsize_t unpartitioned = right - left - vtype::numlanes; + arrsize_t l_store = left; + // indices for loading the elements + left += vtype::numlanes; + right -= vtype::numlanes; + while (right - left != 0) { + reg_t curr_vec; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((l_store + unpartitioned + vtype::numlanes) - right < + left - l_store) { + right -= vtype::numlanes; + curr_vec = vtype::loadu(arr + right); + } else { + curr_vec = vtype::loadu(arr + left); + left += vtype::numlanes; + } + // partition the current vector and save it on both sides of the array + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + curr_vec, pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + } + + /* partition and save vec_left and vec_right */ + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + vec_left, pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + + amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + vec_right, pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; +} + +template +X86_SIMD_SORT_INLINE arrsize_t +partition_avx512_unrolled(type_t *arr, arrsize_t left, arrsize_t right, + type_t pivot, type_t *smallest, type_t *biggest, bool use_gt) { + if constexpr (num_unroll == 0) { + return partition_avx512(arr, left, right, pivot, smallest, + biggest, use_gt); + } + + /* Use regular partition_avx512 for smaller arrays */ + if (right - left < 3 * num_unroll * vtype::numlanes) { + return partition_avx512(arr, left, right, pivot, smallest, + biggest, use_gt); + } + + auto comparison_func = use_gt ? 
comparison_func_gt : comparison_func_ge; + /* make array length divisible by vtype::numlanes, shortening the array */ + for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0; --i) { + *smallest = std::min(*smallest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { + std::swap(arr[left], arr[--right]); + } else { + ++left; + } + } + + arrsize_t unpartitioned = right - left - vtype::numlanes; + arrsize_t l_store = left; + + using reg_t = typename vtype::reg_t; + reg_t pivot_vec = vtype::set1(pivot); + reg_t min_vec = vtype::set1(*smallest); + reg_t max_vec = vtype::set1(*biggest); + + /* Calculate and load more registers to make the rest of the array a + * multiple of num_unroll. These registers will be partitioned at the very + * end. */ + int vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll; + reg_t vec_align[num_unroll]; + for (int i = 0; i < vecsToPartition; i++) { + vec_align[i] = vtype::loadu(arr + left + i * vtype::numlanes); + } + left += vecsToPartition * vtype::numlanes; + + /* We will now have atleast 3*num_unroll registers worth of data to + * process. Load left and right vtype::numlanes*num_unroll values into + * registers to make space for in-place parition. The vec_left and + * vec_right registers are partitioned at the end */ + reg_t vec_left[num_unroll], vec_right[num_unroll]; + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii); + vec_right[ii] = + vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii))); + } + /* indices for loading the elements */ + left += num_unroll * vtype::numlanes; + right -= num_unroll * vtype::numlanes; + while (right - left != 0) { + reg_t curr_vec[num_unroll]; + /* + * if fewer elements are stored on the right side of the array, + * then next elements are loaded from the right side, + * otherwise from the left side + */ + if ((l_store + unpartitioned + vtype::numlanes) - right < + left - l_store) { + right -= num_unroll * vtype::numlanes; + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); + /* + * error: '_mm_prefetch' needs target feature mmx on clang-cl + */ +#if !(defined(_MSC_VER) && defined(__clang__)) + _mm_prefetch((char *)(arr + right + ii * vtype::numlanes - + num_unroll * vtype::numlanes), + _MM_HINT_T0); +#endif + } + } else { + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); + /* + * error: '_mm_prefetch' needs target feature mmx on clang-cl + */ +#if !(defined(_MSC_VER) && defined(__clang__)) + _mm_prefetch((char *)(arr + left + ii * vtype::numlanes + + num_unroll * vtype::numlanes), + _MM_HINT_T0); +#endif + } + left += num_unroll * vtype::numlanes; + } + /* partition the current vector and save it on both sides of the array + * */ + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + arrsize_t amount_ge_pivot = partition_vec( + arr + l_store, arr + l_store + unpartitioned, curr_vec[ii], + pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + } + } + + /* partition and save vec_left[num_unroll] and vec_right[num_unroll] */ + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr 
+ l_store + unpartitioned, + vec_left[ii], pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + } + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < num_unroll; ++ii) { + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + vec_right[ii], pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + } + + /* partition and save vec_align[vecsToPartition] */ + X86_SIMD_SORT_UNROLL_LOOP(8) + for (int ii = 0; ii < vecsToPartition; ++ii) { + arrsize_t amount_ge_pivot = + partition_vec(arr + l_store, arr + l_store + unpartitioned, + vec_align[ii], pivot_vec, min_vec, max_vec, use_gt); + l_store += (vtype::numlanes - amount_ge_pivot); + unpartitioned -= vtype::numlanes; + } + + *smallest = vtype::reducemin(min_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; +} + +template +void sort_n(typename vtype::type_t *arr, int N); + +template +static void qsort_(type_t *arr, arrsize_t left, arrsize_t right, + arrsize_t max_iters) { + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1, comparison_func_ge); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= + * vtype::network_sort_threshold + */ + if (right + 1 - left <= vtype::network_sort_threshold) { + sort_n( + arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_blocks(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + + arrsize_t pivot_index = + partition_avx512_unrolled( + arr, left, right + 1, pivot, &smallest, &biggest, false); + + if (pivot != smallest) + qsort_(arr, left, pivot_index - 1, max_iters - 1); + if (pivot != biggest) qsort_(arr, pivot_index, right, max_iters - 1); +} + +// Hooks for OpenJDK sort +// to_index (exclusive) +template +static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) { + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512_unrolled( + arr, from_index, to_index, pivot, &smallest, &biggest, use_gt); + return pivot_index; +} + +// partitioning functions +template +X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){ + const T pivot1 = arr[index_pivot1]; + const T pivot2 = arr[index_pivot2]; + + const int64_t low = from_index; + const int64_t high = to_index; + const int64_t start = low + 1; + const int64_t end = high - 1; + + + std::swap(arr[index_pivot1], arr[low]); + std::swap(arr[index_pivot2], arr[end]); + + + const int64_t pivot_index2 = vectorized_partition(arr, start, end, pivot2, true); // use_gt = true + std::swap(arr[end], arr[pivot_index2]); + int64_t upper = pivot_index2; + + // if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning + if (upper == start) { + pivot_indices[0] = low; + pivot_indices[1] = upper; + return; + } + + const int64_t pivot_index1 = vectorized_partition(arr, start, upper, pivot1, false); // use_ge (use_gt = false) + int64_t lower = pivot_index1 - 1; + std::swap(arr[low], arr[lower]); + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +X86_SIMD_SORT_INLINE void simd_single_pivot_partition(T *arr, int64_t from_index, int64_t 
to_index, int32_t *pivot_indices, int64_t index_pivot) { + const T pivot = arr[index_pivot]; + + const int64_t low = from_index; + const int64_t high = to_index; + const int64_t end = high - 1; + + + const int64_t pivot_index1 = vectorized_partition(arr, low, high, pivot, false); // use_gt = false (use_ge) + int64_t lower = pivot_index1; + + const int64_t pivot_index2 = vectorized_partition(arr, pivot_index1, high, pivot, true); // use_gt = true + int64_t upper = pivot_index2; + + pivot_indices[0] = lower; + pivot_indices[1] = upper; +} + +template +X86_SIMD_SORT_INLINE void simd_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) { + if (index_pivot1 != index_pivot2) { + simd_dual_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + } + else { + simd_single_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1); + } +} + +template +X86_SIMD_SORT_INLINE void insertion_sort(T *arr, int32_t from_index, int32_t to_index) { + for (int i, k = from_index; ++k < to_index; ) { + T ai = arr[i = k]; + if (ai < arr[i - 1]) { + while (--i >= from_index && ai < arr[i]) { + arr[i + 1] = arr[i]; + } + arr[i + 1] = ai; + } + } +} + +template +X86_SIMD_SORT_INLINE void simd_fast_sort(T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) +{ + arrsize_t arrsize = to_index - from_index; + if (arrsize <= INS_SORT_THRESHOLD) { + insertion_sort(arr, from_index, to_index); + } else { + qsort_(arr, from_index, to_index - 1, 2 * (arrsize_t)log2(arrsize)); + } +} + +#define DEFINE_METHODS(ISA, VTYPE) \ + template \ + X86_SIMD_SORT_INLINE void ISA##_fast_sort( \ + T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) \ + { \ + simd_fast_sort(arr, from_index, to_index, INS_SORT_THRESHOLD); \ + } \ + template \ + X86_SIMD_SORT_INLINE void ISA##_fast_partition( \ + T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) \ + { \ + simd_fast_partition(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); \ + } + +DEFINE_METHODS(avx2, avx2_vector) +DEFINE_METHODS(avx512, zmm_vector) + +#endif // XSS_COMMON_QSORT diff --git a/src/java.base/linux/native/libsimdsort/xss-network-qsort.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-network-qsort.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-network-qsort.hpp rename to src/java.base/linux/native/libsimdsort/x86/xss-network-qsort.hpp diff --git a/src/java.base/linux/native/libsimdsort/xss-optimal-networks.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-optimal-networks.hpp similarity index 100% rename from src/java.base/linux/native/libsimdsort/xss-optimal-networks.hpp rename to src/java.base/linux/native/libsimdsort/x86/xss-optimal-networks.hpp diff --git a/src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp b/src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp new file mode 100644 index 0000000000000..d65a30b56d6d6 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/x86/xss-pivot-selection.hpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. + * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) + +template +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); + +template +X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left, + const arrsize_t right) { + using reg_t = typename vtype::reg_t; + type_t samples[vtype::numlanes]; + arrsize_t delta = (right - left) / vtype::numlanes; + for (int i = 0; i < vtype::numlanes; i++) { + samples[i] = arr[left + i * delta]; + } + reg_t rand_vec = vtype::loadu(samples); + reg_t sort = vtype::sort_vec(rand_vec); + + return ((type_t *)&sort)[vtype::numlanes / 2]; +} + +template +X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left, + const arrsize_t right) { + if (right - left <= 1024) { + return get_pivot(arr, left, right); + } + + using reg_t = typename vtype::reg_t; + constexpr int numVecs = 5; + + arrsize_t width = (right - vtype::numlanes) - left; + arrsize_t delta = width / numVecs; + + reg_t vecs[numVecs]; + // Load data + for (int i = 0; i < numVecs; i++) { + vecs[i] = vtype::loadu(arr + left + delta * i); + } + + // Implement sorting network (from + // https://bertdobbelaere.github.io/sorting_networks.html) + COEX(vecs[0], vecs[3]); + COEX(vecs[1], vecs[4]); + + COEX(vecs[0], vecs[2]); + COEX(vecs[1], vecs[3]); + + COEX(vecs[0], vecs[1]); + COEX(vecs[2], vecs[4]); + + COEX(vecs[1], vecs[2]); + COEX(vecs[3], vecs[4]); + + COEX(vecs[2], vecs[3]); + + // Calculate median of the middle vector + reg_t &vec = vecs[numVecs / 2]; + vec = vtype::sort_vec(vec); + + type_t data[vtype::numlanes]; + vtype::storeu(data, vec); + return data[vtype::numlanes / 2]; +} From 513b1f1d82cf43610d649e2d4669f0d718420986 Mon Sep 17 00:00:00 2001 From: Bhavana Kilambi Date: Thu, 4 Dec 2025 16:12:40 +0000 Subject: [PATCH 2/2] AArch64 SVE implementation for Arrays.sort This patch adds an SVE implementation of primitive array sorting (Arrays.sort()) on AArch64 systems that support SVE. On non-SVE machines, we fall back to the existing Java implementation. For smaller arrays (length <= 64), we use insertion sort; for larger arrays we use an SVE-vectorized quicksort partitioner followed by an odd-even transposition cleanup pass. The SVE path is enabled by default for int type. For float type, it is available through the experimental flag : -XX:+UnlockExperimentalVMOptions -XX:+UseSVELibSimdSortForFP Without this flag being enabled, the default Java implementation would be executed for floats (the flag is disabled by default). Float is gated due to observed regressions on some small/medium sizes. 
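As a rough, illustrative sketch of the kind of SVE-vectorized partition step described above (this is not the code added by this patch): the helper name sve_partition_step_s32 and the separate lo/hi output buffers are assumptions made here for clarity, and the partitioner actually added in sve-common-qsort.hpp works in place from both ends of the array. The sketch only shows how one vector-sized chunk can be split around the pivot using ACLE intrinsics, with svcompact playing the role that the AVX-512 compress stores play in the existing x86 path.

    #include <arm_sve.h>
    #include <cstdint>

    // Split one vector-sized chunk of src around 'pivot': active lanes that are
    // < pivot are compacted into lo_out, the remaining active lanes into hi_out.
    // Callers advance i by svcntw() lanes per iteration.
    static inline void sve_partition_step_s32(const int32_t *src, int64_t i, int64_t n,
                                              int32_t pivot,
                                              int32_t *lo_out, int64_t &lo_cnt,
                                              int32_t *hi_out, int64_t &hi_cnt) {
        svbool_t pg = svwhilelt_b32_s64(i, n);        // active lanes of this chunk
        svint32_t v = svld1_s32(pg, src + i);
        svbool_t lt = svcmplt_n_s32(pg, v, pivot);    // lanes strictly below the pivot
        svbool_t ge = svnot_b_z(pg, lt);              // remaining active lanes
        uint64_t n_lt = svcntp_b32(pg, lt);
        uint64_t n_ge = svcntp_b32(pg, ge);
        // svcompact packs the selected lanes to the low end of the vector
        svst1_s32(svwhilelt_b32_u64(0, n_lt), lo_out + lo_cnt, svcompact_s32(lt, v));
        svst1_s32(svwhilelt_b32_u64(0, n_ge), hi_out + hi_cnt, svcompact_s32(ge, v));
        lo_cnt += (int64_t)n_lt;
        hi_cnt += (int64_t)n_ge;
    }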
On larger arrays, the SVE float path shows up to 1.47x speedup on Neoverse V2 and 2.12x on Neoverse V1.

Following are the performance numbers for the ArraysSort JMH benchmark -

Case A: Ratio between the scores of the master branch and this patch with the UseSVELibSimdSortForFP flag disabled (which is the default).
Case B: Ratio between the scores of the master branch and this patch with the UseSVELibSimdSortForFP flag enabled (the int numbers will be the same, but this now enables SVE vectorized sorting for floats).

We would want the ratios to be >= 1 to be at par with or better than the default Java implementation (master branch).

On Neoverse V1:

Benchmark (size) Mode Cnt A B
ArraysSort.floatParallelSort 10 avgt 3 0.98 0.98
ArraysSort.floatParallelSort 25 avgt 3 1.01 0.83
ArraysSort.floatParallelSort 50 avgt 3 0.99 0.55
ArraysSort.floatParallelSort 75 avgt 3 0.99 0.66
ArraysSort.floatParallelSort 100 avgt 3 0.98 0.66
ArraysSort.floatParallelSort 1000 avgt 3 1.00 0.84
ArraysSort.floatParallelSort 10000 avgt 3 1.03 1.52
ArraysSort.floatParallelSort 100000 avgt 3 1.03 1.46
ArraysSort.floatParallelSort 1000000 avgt 3 0.98 1.81
ArraysSort.floatSort 10 avgt 3 1.00 0.98
ArraysSort.floatSort 25 avgt 3 1.00 0.81
ArraysSort.floatSort 50 avgt 3 0.99 0.56
ArraysSort.floatSort 75 avgt 3 0.99 0.65
ArraysSort.floatSort 100 avgt 3 0.98 0.70
ArraysSort.floatSort 1000 avgt 3 0.99 0.84
ArraysSort.floatSort 10000 avgt 3 0.99 1.72
ArraysSort.floatSort 100000 avgt 3 1.00 1.94
ArraysSort.floatSort 1000000 avgt 3 1.00 2.13
ArraysSort.intParallelSort 10 avgt 3 1.08 1.08
ArraysSort.intParallelSort 25 avgt 3 1.04 1.05
ArraysSort.intParallelSort 50 avgt 3 1.29 1.30
ArraysSort.intParallelSort 75 avgt 3 1.16 1.16
ArraysSort.intParallelSort 100 avgt 3 1.07 1.07
ArraysSort.intParallelSort 1000 avgt 3 1.13 1.13
ArraysSort.intParallelSort 10000 avgt 3 1.49 1.38
ArraysSort.intParallelSort 100000 avgt 3 1.64 1.62
ArraysSort.intParallelSort 1000000 avgt 3 2.26 2.27
ArraysSort.intSort 10 avgt 3 1.08 1.08
ArraysSort.intSort 25 avgt 3 1.02 1.02
ArraysSort.intSort 50 avgt 3 1.25 1.25
ArraysSort.intSort 75 avgt 3 1.16 1.20
ArraysSort.intSort 100 avgt 3 1.07 1.07
ArraysSort.intSort 1000 avgt 3 1.12 1.13
ArraysSort.intSort 10000 avgt 3 1.94 1.95
ArraysSort.intSort 100000 avgt 3 1.86 1.86
ArraysSort.intSort 1000000 avgt 3 2.09 2.09

On Neoverse V2:

Benchmark (size) Mode Cnt A B
ArraysSort.floatParallelSort 10 avgt 3 1.02 1.02
ArraysSort.floatParallelSort 25 avgt 3 0.97 0.71
ArraysSort.floatParallelSort 50 avgt 3 0.94 0.65
ArraysSort.floatParallelSort 75 avgt 3 0.96 0.82
ArraysSort.floatParallelSort 100 avgt 3 0.95 0.84
ArraysSort.floatParallelSort 1000 avgt 3 1.01 0.94
ArraysSort.floatParallelSort 10000 avgt 3 1.01 1.25
ArraysSort.floatParallelSort 100000 avgt 3 1.01 1.09
ArraysSort.floatParallelSort 1000000 avgt 3 1.00 1.10
ArraysSort.floatSort 10 avgt 3 1.02 1.00
ArraysSort.floatSort 25 avgt 3 0.99 0.76
ArraysSort.floatSort 50 avgt 3 0.97 0.66
ArraysSort.floatSort 75 avgt 3 1.01 0.83
ArraysSort.floatSort 100 avgt 3 1.00 0.85
ArraysSort.floatSort 1000 avgt 3 0.99 0.93
ArraysSort.floatSort 10000 avgt 3 1.00 1.28
ArraysSort.floatSort 100000 avgt 3 1.00 1.37
ArraysSort.floatSort 1000000 avgt 3 1.00 1.48
ArraysSort.intParallelSort 10 avgt 3 1.05 1.05
ArraysSort.intParallelSort 25 avgt 3 0.99 0.84
ArraysSort.intParallelSort 50 avgt 3 1.03 1.14
ArraysSort.intParallelSort 75 avgt 3 0.91 0.99
ArraysSort.intParallelSort 100 avgt 3 0.98 0.96
ArraysSort.intParallelSort 1000 avgt 3 1.32 1.30
ArraysSort.intParallelSort 10000 avgt 3 1.40 1.40
ArraysSort.intParallelSort 100000 avgt 3 1.00 1.04
ArraysSort.intParallelSort 1000000 avgt 3 1.15 1.14 ArraysSort.intSort 10 avgt 3 1.05 1.05 ArraysSort.intSort 25 avgt 3 1.03 1.03 ArraysSort.intSort 50 avgt 3 1.08 1.14 ArraysSort.intSort 75 avgt 3 0.88 0.98 ArraysSort.intSort 100 avgt 3 1.01 0.99 ArraysSort.intSort 1000 avgt 3 1.3 1.32 ArraysSort.intSort 10000 avgt 3 1.43 1.43 ArraysSort.intSort 100000 avgt 3 1.30 1.30 ArraysSort.intSort 1000000 avgt 3 1.37 1.37 --- make/modules/java.base/Lib.gmk | 19 + src/hotspot/cpu/aarch64/globals_aarch64.hpp | 4 +- src/hotspot/cpu/aarch64/matcher_aarch64.hpp | 7 + .../cpu/aarch64/stubGenerator_aarch64.cpp | 4 + src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 20 +- src/hotspot/share/runtime/stubRoutines.cpp | 32 + src/hotspot/share/runtime/stubRoutines.hpp | 4 + .../libsimdsort/aarch64/pivot-selection.hpp | 76 +-- .../libsimdsort/aarch64/simdsort-support.hpp | 9 +- .../libsimdsort/aarch64/sve-common-qsort.hpp | 620 +++++++++--------- .../native/libsimdsort/aarch64/sve-config.hpp | 55 ++ .../libsimdsort/aarch64/sve-linux-qsort.cpp | 33 +- .../libsimdsort/aarch64/sve-oet-sort.hpp | 52 ++ .../native/libsimdsort/aarch64/sve-qsort.hpp | 445 +++++-------- 14 files changed, 686 insertions(+), 694 deletions(-) create mode 100644 src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp create mode 100644 src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk index 41da22f8cb2f4..72dd0fe6c989e 100644 --- a/make/modules/java.base/Lib.gmk +++ b/make/modules/java.base/Lib.gmk @@ -187,6 +187,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true) TARGETS += $(BUILD_LIBFALLBACKLINKER) endif +SIMDSORT_BASE_DIR := $(TOPDIR)/src/java.base/linux/native/libsimdsort ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc) ############################################################################## ## Build libsimdsort @@ -196,6 +197,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) NAME := simdsort, \ LINK_TYPE := C++, \ OPTIMIZATION := HIGH, \ + SRC := $(SIMDSORT_BASE_DIR)/x86, \ CXXFLAGS := -std=c++17, \ DISABLED_WARNINGS_gcc := unused-variable, \ LIBS_linux := $(LIBM), \ @@ -204,4 +206,21 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2) TARGETS += $(BUILD_LIBSIMD_SORT) endif +ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc) + $(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \ + NAME := simdsort, \ + TOOLCHAIN := TOOLCHAIN_LINK_CXX, \ + OPTIMIZATION := HIGH, \ + SRC := $(SIMDSORT_BASE_DIR)/aarch64, \ + CFLAGS := $(CFLAGS_JDKLIB) -march=armv8.2-a+sve, \ + CXXFLAGS := $(CXXFLAGS_JDKLIB) -march=armv8.2-a+sve -std=c++17, \ + LDFLAGS := $(LDFLAGS_JDKLIB) \ + $(call SET_SHARED_LIBRARY_ORIGIN), \ + LIBS := $(LIBCXX), \ + DISABLED_WARNINGS_gcc := unused-variable, \ + LIBS_linux := -lc -lm -ldl, \ + )) + + TARGETS += $(BUILD_LIBSIMD_SORT) +endif ################################################################################ diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp index 8e520314c8b6f..4f6451462aa3f 100644 --- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp @@ -127,7 +127,9 @@ define_pd_global(intx, InlineSmallCode, 1000); "Branch Protection to use: none, standard, pac-ret") \ product(bool, AlwaysMergeDMB, 
true, DIAGNOSTIC, \ "Always merge DMB instructions in code emission") \ - + product(bool, UseSVELibSimdSortForFP, false, EXPERIMENTAL, \ + "Use SVE-based LibSimdSort for float type on SVE supporting " \ + "machines") \ // end of ARCH_FLAGS #endif // CPU_AARCH64_GLOBALS_AARCH64_HPP diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index 0fbc2ef141e8b..a0d98f3304afe 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -197,6 +197,13 @@ // Is SIMD sort supported for this CPU? static bool supports_simd_sort(BasicType bt) { + // SIMD sort is supported only on SVE machines + if (VM_Version::supports_sve()) { + // Currently, only T_INT and T_FLOAT types are supported. + // However, T_FLOAT is supported only if the experimental + // flag - UseSVELibSimdSortForFP is enabled. + return (bt == T_INT || (bt == T_FLOAT && UseSVELibSimdSortForFP)); + } return false; } diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 7e2f333ba4086..8025aa0c9331e 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -11873,6 +11873,10 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_montgomerySquare = g.generate_multiply(); } + // Load sve_sort library on supported hardware to enable SIMD sort and partition intrinsics + if (VM_Version::supports_sve()) { + (void)StubRoutines::try_load_simdsort("sve_sort", "sve_partition"); + } #endif // COMPILER2 if (UseChaCha20Intrinsics) { diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index efb0411aa39df..0ba451166de6a 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -4303,22 +4303,10 @@ void StubGenerator::generate_compiler_stubs() { // Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics if (VM_Version::supports_avx512dq() || VM_Version::supports_avx2()) { - void *libsimdsort = nullptr; - char ebuf_[1024]; - char dll_name_simd_sort[JVM_MAXPATHLEN]; - if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) { - libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); - } - // Get addresses for SIMD sort and partition routines - if (libsimdsort != nullptr) { - log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); - - os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_sort" : "avx2_sort"); - StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_); - - os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_partition" : "avx2_partition"); - StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_); - } + const bool use_avx512 = VM_Version::supports_avx512_simd_sort(); + const char* sort_sym = use_avx512 ? "avx512_sort" : "avx2_sort"; + const char* partition_sym = use_avx512 ? 
"avx512_partition" : "avx2_partition"; + (void)StubRoutines::try_load_simdsort(sort_sym, partition_sym); } #endif // COMPILER2 diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 5246613738e46..3131965bd63b1 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -469,6 +469,38 @@ StubRoutines::select_arraycopy_function(BasicType t, bool aligned, bool disjoint #undef RETURN_STUB_PARM } +bool StubRoutines::try_load_simdsort(const char* sort_sym, const char* partition_sym) { + void* libsimdsort = nullptr; + char ebuf_[1024]; + char dll_name_simd_sort[JVM_MAXPATHLEN]; + + if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), + Arguments::get_dll_dir(), "simdsort")) { + libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_); + } + + if (libsimdsort == nullptr) { + return false; + } + + // Get addresses for SIMD sort and partition routines + log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, + JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort)); + address sort_addr = (address)os::dll_lookup(libsimdsort, sort_sym); + address partition_addr = (address)os::dll_lookup(libsimdsort, partition_sym); + + if (sort_addr == nullptr || partition_addr == nullptr) { + log_warning(library)("libsimdsort missing symbols: %s=" INTPTR_FORMAT ", %s=" INTPTR_FORMAT, + sort_sym, p2i(sort_addr), partition_sym, p2i(partition_addr)); + // If either of the addresses are null, return false. + return false; + } + + StubRoutines::_array_sort = sort_addr; + StubRoutines::_array_partition = partition_addr; + return true; +} + UnsafeMemoryAccessMark::UnsafeMemoryAccessMark(StubCodeGenerator* cgen, bool add_entry, bool continue_at_scope_end, address error_exit_pc) { _cgen = cgen; _ucm_entry = nullptr; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 97e3e46b87063..edcbad64a19a6 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -28,6 +28,7 @@ #include "code/codeBlob.hpp" #include "memory/allocation.hpp" #include "prims/vectorSupport.hpp" +#include "runtime/arguments.hpp" #include "runtime/frame.hpp" #include "runtime/mutexLocker.hpp" #include "runtime/stubCodeGenerator.hpp" @@ -362,6 +363,9 @@ class StubRoutines: AllStatic { static void arrayof_oop_copy (HeapWord* src, HeapWord* dest, size_t count); static void arrayof_oop_copy_uninit(HeapWord* src, HeapWord* dest, size_t count); + // SIMD sort support. This method resolves the symbols - sort_sym, partition_sym + // and on success sets the StubRoutines::_array_sort/_array_partition and returns true. + static bool try_load_simdsort(const char* sort_sym, const char* partition_sym); }; #endif // SHARE_RUNTIME_STUBROUTINES_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp b/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp index d65a30b56d6d6..848f8a8562d7d 100644 --- a/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp +++ b/src/java.base/linux/native/libsimdsort/aarch64/pivot-selection.hpp @@ -1,6 +1,7 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,66 +24,31 @@ * */ -// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) +#ifndef AARCH64_SVE_PIVOT_SELECTION_HPP +#define AARCH64_SVE_PIVOT_SELECTION_HPP -template -X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); +#include +#include "sve-config.hpp" -template -X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left, - const arrsize_t right) { - using reg_t = typename vtype::reg_t; - type_t samples[vtype::numlanes]; - arrsize_t delta = (right - left) / vtype::numlanes; - for (int i = 0; i < vtype::numlanes; i++) { - samples[i] = arr[left + i * delta]; - } - reg_t rand_vec = vtype::loadu(samples); - reg_t sort = vtype::sort_vec(rand_vec); - - return ((type_t *)&sort)[vtype::numlanes / 2]; -} +/* The current pivot selection method follows median-of-three method. + * Possible improvements could be the usage of sorting network (Compare and exchange sorting) + * for larger arrays. + */ template -X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left, - const arrsize_t right) { - if (right - left <= 1024) { - return get_pivot(arr, left, right); - } - - using reg_t = typename vtype::reg_t; - constexpr int numVecs = 5; - - arrsize_t width = (right - vtype::numlanes) - left; - arrsize_t delta = width / numVecs; +static inline type_t get_pivot_blocks(type_t* arr, const arrsize_t left, const arrsize_t right) { + const arrsize_t len = right - left; + if (len < 64) return arr[left]; - reg_t vecs[numVecs]; - // Load data - for (int i = 0; i < numVecs; i++) { - vecs[i] = vtype::loadu(arr + left + delta * i); - } + const arrsize_t mid = left + (len / 2); + const type_t a = arr[left]; + const type_t b = arr[mid]; + const type_t c = arr[right - 1]; - // Implement sorting network (from - // https://bertdobbelaere.github.io/sorting_networks.html) - COEX(vecs[0], vecs[3]); - COEX(vecs[1], vecs[4]); + const type_t min_ab = std::min(a, b); + const type_t max_ab = std::max(a, b); - COEX(vecs[0], vecs[2]); - COEX(vecs[1], vecs[3]); - - COEX(vecs[0], vecs[1]); - COEX(vecs[2], vecs[4]); - - COEX(vecs[1], vecs[2]); - COEX(vecs[3], vecs[4]); - - COEX(vecs[2], vecs[3]); - - // Calculate median of the middle vector - reg_t &vec = vecs[numVecs / 2]; - vec = vtype::sort_vec(vec); - - type_t data[vtype::numlanes]; - vtype::storeu(data, vec); - return data[vtype::numlanes / 2]; + return std::min(max_ab, std::max(min_ab, c)); } + +#endif // AARCH64_SVE_PIVOT_SELECTION_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp b/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp index f6946fdccec28..4773332f31281 100644 --- a/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp +++ b/src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -30,10 +31,10 @@ #undef assert #define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }} - -// GCC >= 9.1 is needed to build AVX2 portions of libsimdsort using C++17 features -#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 9) || ((__GNUC__ == 9) && (__GNUC_MINOR__ >= 1)))) +// GCC >= 10.1 is required for a full support of ARM SVE ACLE intrinsics (which also includes the header file - arm_sve.h) +#if defined(__aarch64__) && defined(_LP64) && defined(__GNUC__) && \ + ((__GNUC__ > 10) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 1)) #define __SIMDSORT_SUPPORTED_LINUX #endif -#endif //SIMDSORT_SUPPORT_HPP \ No newline at end of file +#endif //SIMDSORT_SUPPORT_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp index 95fe8738d35e2..d4f799b9eb858 100644 --- a/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-common-qsort.hpp @@ -1,6 +1,7 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,135 +24,76 @@ * */ -// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) +#ifndef AARCH64_SVE_COMMON_QSORT_HPP +#define AARCH64_SVE_COMMON_QSORT_HPP +#include +#include +#include +#include -#ifndef XSS_COMMON_QSORT -#define XSS_COMMON_QSORT - -/* - * Quicksort using AVX-512. The ideas and code are based on these two research - * papers [1] and [2]. On a high level, the idea is to vectorize quicksort - * partitioning using AVX-512 compressstore instructions. If the array size is - * < 128, then use Bitonic sorting network implemented on 512-bit registers. - * The precise network definitions depend on the dtype and are defined in - * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and - * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting - * network. The core implementations of the vectorized qsort functions - * avx512_qsort(T*, arrsize_t) are modified versions of avx2 quicksort - * presented in the paper [2] and source code associated with that paper [3]. 
- * - * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * - * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: - * MIT - * - * [4] https://mitp-content-server.mit.edu/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - */ - -#include "xss-common-includes.h" -#include "xss-pivot-selection.hpp" -#include "xss-network-qsort.hpp" - - -template -bool is_a_nan(T elem) { - return std::isnan(elem); -} - -template -X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) { - // median of 8 equally spaced elements - int64_t NUM_ELEMENTS = 8; - int64_t MID = NUM_ELEMENTS / 2; - int64_t size = (right - left) / NUM_ELEMENTS; - T temp[NUM_ELEMENTS]; - for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)]; - std::sort(temp, temp + NUM_ELEMENTS); - return temp[MID]; -} +#include "sve-config.hpp" +#include "classfile_constants.h" +#include "simdsort-support.hpp" +#include "sve-qsort.hpp" +#include "pivot-selection.hpp" +#include "sve-oet-sort.hpp" template -bool comparison_func_ge(const T &a, const T &b) { +bool sve_comparison_func_ge(const T &a, const T &b) { return a < b; } template -bool comparison_func_gt(const T &a, const T &b) { +bool sve_comparison_func_gt(const T &a, const T &b) { return a <= b; } /* - * COEX == Compare and Exchange two registers by swapping min and max values + * Partitions a single SIMD vector based on a pivot and returns the number + * of lanes greater than or equal to the pivot. */ -template -X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) { - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template -X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) { - reg_t min = vtype::min(in2, in1); - reg_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max -} - -template -int avx512_double_compressstore(type_t *left_addr, type_t *right_addr, - typename vtype::opmask_t k, reg_t reg) { - int amount_ge_pivot = _mm_popcnt_u32((int)k); - - vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg); - vtype::mask_compressstoreu(right_addr + vtype::numlanes - amount_ge_pivot, - k, reg); - - return amount_ge_pivot; -} - -// Generic function dispatches to AVX2 or AVX512 code template -X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store, - const reg_t curr_vec, - const reg_t pivot_vec, - reg_t &smallest_vec, - reg_t &biggest_vec, bool use_gt) { - //typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); +SVE_SORT_INLINE arrsize_t partition_vec(type_t *l_store, type_t *r_store, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t &smallest_vec, + reg_t &biggest_vec, bool use_gt) { typename vtype::opmask_t mask; - if (use_gt) mask = vtype::gt(curr_vec, pivot_vec); - else mask = vtype::ge(curr_vec, pivot_vec); + if (use_gt) { + mask = vtype::gt(curr_vec, pivot_vec); + } else { + mask = vtype::ge(curr_vec, pivot_vec); + } - int amount_ge_pivot = - vtype::double_compressstore(l_store, r_store, mask, curr_vec); + int amount_ge_pivot = vtype::double_compressstore(l_store, r_store, mask, curr_vec); smallest_vec = vtype::min(curr_vec, smallest_vec); - biggest_vec = vtype::max(curr_vec, biggest_vec); + biggest_vec = vtype::max(curr_vec, biggest_vec); return 
amount_ge_pivot; } /* - * Parition an array based on the pivot and returns the index of the + * Partition an array based on the pivot and returns the index of the * first element that is greater than or equal to the pivot. */ template -X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left, - arrsize_t right, type_t pivot, - type_t *smallest, - type_t *biggest, - bool use_gt) { - auto comparison_func = use_gt ? comparison_func_gt : comparison_func_ge; - /* make array length divisible by vtype::numlanes , shortening the array */ - for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { +SVE_SORT_INLINE arrsize_t sve_vect_partition_(type_t *arr, arrsize_t left, + arrsize_t right, type_t pivot, + type_t *smallest, + type_t *biggest, + bool use_gt) { + auto comparison_func = use_gt ? sve_comparison_func_gt : sve_comparison_func_ge; + + // Store the number of lanes in a local variable + const arrsize_t num_lanes = vtype::numlanes(); + + /* make array length divisible by num_lanes, shortening the array */ + for (int32_t i = (right - left) % num_lanes; i > 0; --i) { *smallest = std::min(*smallest, arr[left], comparison_func); *biggest = std::max(*biggest, arr[left], comparison_func); + if (!comparison_func(arr[left], pivot)) { std::swap(arr[left], arr[--right]); } else { @@ -160,36 +102,43 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left, } if (left == right) - return left; /* less than vtype::numlanes elements in the array */ + return left; /* less than num_lanes elements in the array */ using reg_t = typename vtype::reg_t; + reg_t pivot_vec = vtype::set1(pivot); reg_t min_vec = vtype::set1(*smallest); reg_t max_vec = vtype::set1(*biggest); - if (right - left == vtype::numlanes) { + // If there is only num_lanes worth of elements to be sorted + if (right - left == num_lanes) { reg_t vec = vtype::loadu(arr + left); - arrsize_t unpartitioned = right - left - vtype::numlanes; arrsize_t l_store = left; + arrsize_t r_store = l_store; + + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec, pivot_vec, min_vec, max_vec, use_gt); - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec, pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); + l_store += (num_lanes - amount_ge_pivot); *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; } - // first and last vtype::numlanes values are partitioned at the end + // first and last num_lanes values are partitioned at the end reg_t vec_left = vtype::loadu(arr + left); - reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); + reg_t vec_right = vtype::loadu(arr + (right - num_lanes)); + // store points of the vectors - arrsize_t unpartitioned = right - left - vtype::numlanes; arrsize_t l_store = left; + arrsize_t r_store = right - num_lanes; + // indices for loading the elements - left += vtype::numlanes; - right -= vtype::numlanes; + left += num_lanes; + right -= num_lanes; + while (right - left != 0) { reg_t curr_vec; /* @@ -197,61 +146,85 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, arrsize_t left, * then next elements are loaded from the right side, * otherwise from the left side */ - if ((l_store + unpartitioned + vtype::numlanes) - right < - left - l_store) { - right -= vtype::numlanes; + if ((r_store + num_lanes) - right < left - l_store) { + right -= num_lanes; curr_vec = 
vtype::loadu(arr + right); } else { curr_vec = vtype::loadu(arr + left); - left += vtype::numlanes; + left += num_lanes; } // partition the current vector and save it on both sides of the array - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - curr_vec, pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + curr_vec, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; } /* partition and save vec_left and vec_right */ - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec_left, pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; - - amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec_right, pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec_left, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; + + + amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + vec_right, pivot_vec, min_vec, max_vec, use_gt); + l_store += (num_lanes - amount_ge_pivot); + r_store -= amount_ge_pivot; *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); + *biggest = vtype::reducemax(max_vec); + return l_store; } -template -X86_SIMD_SORT_INLINE arrsize_t -partition_avx512_unrolled(type_t *arr, arrsize_t left, arrsize_t right, - type_t pivot, type_t *smallest, type_t *biggest, bool use_gt) { - if constexpr (num_unroll == 0) { - return partition_avx512(arr, left, right, pivot, smallest, - biggest, use_gt); +// Process a single vector for partitioning +template +SVE_SORT_INLINE void sve_partition_single_vec(type_t* arr, + arrsize_t& l_store, + arrsize_t& r_store, + typename vtype::reg_t v, + typename vtype::reg_t pivot_vec, + typename vtype::reg_t& min_vec, + typename vtype::reg_t& max_vec, + bool use_gt, arrsize_t num_lanes) { + arrsize_t amount_ge_pivot = partition_vec(arr + l_store, + arr + r_store, + v, pivot_vec, min_vec, max_vec, use_gt); + + l_store += num_lanes - amount_ge_pivot; + r_store -= amount_ge_pivot; +} + +// Unrolled version of sve_vect_partition_() with an UNROLL_FACTOR of either 2 or 4 +// The UNROLL_FACTOR is 2 if the vector length <= 16B and it is 4 if the vector length > 16B +template +SVE_SORT_INLINE arrsize_t +sve_partition_unrolled(type_t* arr, arrsize_t left, arrsize_t right, + type_t pivot, type_t* smallest, type_t* biggest, bool use_gt) { + static_assert(UNROLL_FACTOR == 2 || UNROLL_FACTOR == 4, "unsupported unroll factor"); + + const arrsize_t num_lanes = vtype::numlanes(); + + if constexpr (UNROLL_FACTOR == 0) { + return sve_vect_partition_(arr, left, right, pivot, smallest, biggest, use_gt); } - /* Use regular partition_avx512 for smaller arrays */ - if (right - left < 3 * num_unroll * vtype::numlanes) { - return partition_avx512(arr, left, right, pivot, smallest, - biggest, use_gt); + // use regular partition routine for small arrays + if (right - left < 3 * UNROLL_FACTOR * num_lanes) { + return sve_vect_partition_(arr, left, right, pivot, smallest, biggest, use_gt); } - auto comparison_func = use_gt ? 
comparison_func_gt : comparison_func_ge; - /* make array length divisible by vtype::numlanes, shortening the array */ - for (int32_t i = ((right - left) % (vtype::numlanes)); i > 0; --i) { + auto comparison_func = use_gt ? sve_comparison_func_gt + : sve_comparison_func_ge; + + // make array length divisible by num_lanes, shortening the array + for (int32_t i = (right - left) % num_lanes; i > 0; --i) { *smallest = std::min(*smallest, arr[left], comparison_func); - *biggest = std::max(*biggest, arr[left], comparison_func); + *biggest = std::max(*biggest, arr[left], comparison_func); if (!comparison_func(arr[left], pivot)) { std::swap(arr[left], arr[--right]); } else { @@ -259,170 +232,205 @@ partition_avx512_unrolled(type_t *arr, arrsize_t left, arrsize_t right, } } - arrsize_t unpartitioned = right - left - vtype::numlanes; arrsize_t l_store = left; + arrsize_t r_store = right - num_lanes; using reg_t = typename vtype::reg_t; reg_t pivot_vec = vtype::set1(pivot); - reg_t min_vec = vtype::set1(*smallest); - reg_t max_vec = vtype::set1(*biggest); + reg_t min_vec = vtype::set1(*smallest); + reg_t max_vec = vtype::set1(*biggest); /* Calculate and load more registers to make the rest of the array a * multiple of num_unroll. These registers will be partitioned at the very * end. */ - int vecsToPartition = ((right - left) / vtype::numlanes) % num_unroll; - reg_t vec_align[num_unroll]; - for (int i = 0; i < vecsToPartition; i++) { - vec_align[i] = vtype::loadu(arr + left + i * vtype::numlanes); - } - left += vecsToPartition * vtype::numlanes; - - /* We will now have atleast 3*num_unroll registers worth of data to - * process. Load left and right vtype::numlanes*num_unroll values into + int vecsToPartition = ((right - left) / num_lanes) % UNROLL_FACTOR; + +#define SVE_UNROLL_APPLY(OP) \ + do { \ + if constexpr (UNROLL_FACTOR >= 1) { OP(0); } \ + if constexpr (UNROLL_FACTOR >= 2) { OP(1); } \ + if constexpr (UNROLL_FACTOR >= 3) { OP(2); } \ + if constexpr (UNROLL_FACTOR >= 4) { OP(3); } \ + } while (false) + +#define SVE_DECLARE_REG_SET(NAME, INIT) \ + [[maybe_unused]] reg_t NAME##0 = (INIT); \ + [[maybe_unused]] reg_t NAME##1 = NAME##0; \ + [[maybe_unused]] reg_t NAME##2 = NAME##0; \ + [[maybe_unused]] reg_t NAME##3 = NAME##0 + +#define SVE_DECLARE_REG_SET_UNINIT(NAME) \ + reg_t NAME##0; \ + reg_t NAME##1; \ + reg_t NAME##2; \ + reg_t NAME##3 + +#define SVE_REG(NAME, IDX) NAME##IDX + +#define SVE_PARTITION_ONE(REG) \ + sve_partition_single_vec(arr, l_store, r_store, \ + REG, pivot_vec, min_vec, max_vec, \ + use_gt, num_lanes) + +#define SVE_LOAD_BLOCK_FROM(BASE_PTR, NAME, I) \ + SVE_REG(NAME, I) = vtype::loadu((BASE_PTR) + (I) * num_lanes) + +#define SVE_LOAD_TAIL(I) \ + do { \ + if (vecsToPartition > (I)) { \ + SVE_LOAD_BLOCK_FROM(arr + left, align_vec, I); \ + } \ + } while(false) + +#define SVE_LOAD_LEFT(I) \ + SVE_LOAD_BLOCK_FROM(arr + left, left_vec, I) + +#define SVE_LOAD_RIGHT(I) \ + SVE_LOAD_BLOCK_FROM(arr + right_load_start, right_vec, I) + +#define SVE_LOAD_BATCH_FROM_RIGHT(I) \ + SVE_LOAD_BLOCK_FROM(arr + right, curr_vec, I) + +#define SVE_LOAD_BATCH_FROM_LEFT(I) \ + SVE_LOAD_BLOCK_FROM(arr + left, curr_vec, I) + +#define SVE_PARTITION_BATCH(I) \ + SVE_PARTITION_ONE(SVE_REG(curr_vec, I)) +#define SVE_PARTITION_LEFT(I) SVE_PARTITION_ONE(SVE_REG(left_vec, I)) +#define SVE_PARTITION_RIGHT(I) SVE_PARTITION_ONE(SVE_REG(right_vec, I)) +#define SVE_PARTITION_TAIL(I) \ + do { \ + if (vecsToPartition > (I)) { \ + SVE_PARTITION_ONE(SVE_REG(align_vec, I)); \ + } \ + } while(false) + + // Initialize 
the vectors to something arbitrary which will be overwritten when + // the appropriate array elements are loaded in them + SVE_DECLARE_REG_SET(align_vec, vtype::set1(pivot)); + + // Load the align_vec vectors depending on the vecsToPartition value + SVE_UNROLL_APPLY(SVE_LOAD_TAIL); + + // Initialize the vectors to something arbitrary which will be overwritten when + // the appropriate array elements are loaded in them + left += vecsToPartition * num_lanes; + + /* Load left and right vtype::numlanes*num_unroll values into * registers to make space for in-place parition. The vec_left and - * vec_right registers are partitioned at the end */ - reg_t vec_left[num_unroll], vec_right[num_unroll]; - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii); - vec_right[ii] = - vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii))); - } + * vec_right registers are partitioned at the end. + * Similar to the align_vec vectors, the left and right vectors + * are also initialized to an arbitrary value which will eventually be + * overwritten by array loads. */ + + SVE_DECLARE_REG_SET(left_vec, vtype::set1(pivot)); + SVE_DECLARE_REG_SET(right_vec, vtype::set1(pivot)); + + const arrsize_t right_load_start = right - UNROLL_FACTOR * num_lanes; + + SVE_UNROLL_APPLY(SVE_LOAD_LEFT); + SVE_UNROLL_APPLY(SVE_LOAD_RIGHT); + /* indices for loading the elements */ - left += num_unroll * vtype::numlanes; - right -= num_unroll * vtype::numlanes; - while (right - left != 0) { - reg_t curr_vec[num_unroll]; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((l_store + unpartitioned + vtype::numlanes) - right < - left - l_store) { - right -= num_unroll * vtype::numlanes; - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); - /* - * error: '_mm_prefetch' needs target feature mmx on clang-cl - */ -#if !(defined(_MSC_VER) && defined(__clang__)) - _mm_prefetch((char *)(arr + right + ii * vtype::numlanes - - num_unroll * vtype::numlanes), - _MM_HINT_T0); -#endif - } + left += UNROLL_FACTOR * num_lanes; + right -= UNROLL_FACTOR * num_lanes; + + while ((right - left) != 0) { + if ((r_store + num_lanes) - right < left - l_store) { + // Load from the right side if there are fewer elements on the right + // and partition the vectors + // TODO: Explore if prefetching the next set of vectors would be beneficial here + right -= (UNROLL_FACTOR * num_lanes); + SVE_DECLARE_REG_SET_UNINIT(curr_vec); + SVE_UNROLL_APPLY(SVE_LOAD_BATCH_FROM_RIGHT); + SVE_UNROLL_APPLY(SVE_PARTITION_BATCH); } else { - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); - /* - * error: '_mm_prefetch' needs target feature mmx on clang-cl - */ -#if !(defined(_MSC_VER) && defined(__clang__)) - _mm_prefetch((char *)(arr + left + ii * vtype::numlanes + - num_unroll * vtype::numlanes), - _MM_HINT_T0); -#endif - } - left += num_unroll * vtype::numlanes; - } - /* partition the current vector and save it on both sides of the array - * */ - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - arrsize_t amount_ge_pivot = partition_vec( - arr + l_store, arr + l_store + unpartitioned, curr_vec[ii], - pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - 
amount_ge_pivot); - unpartitioned -= vtype::numlanes; + // Load from the left side if there are fewer elements on the left + // and partition the vectors + SVE_DECLARE_REG_SET_UNINIT(curr_vec); + SVE_UNROLL_APPLY(SVE_LOAD_BATCH_FROM_LEFT); + left += UNROLL_FACTOR * num_lanes; + SVE_UNROLL_APPLY(SVE_PARTITION_BATCH); } } - /* partition and save vec_left[num_unroll] and vec_right[num_unroll] */ - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec_left[ii], pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; - } - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < num_unroll; ++ii) { - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec_right[ii], pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; - } - - /* partition and save vec_align[vecsToPartition] */ - X86_SIMD_SORT_UNROLL_LOOP(8) - for (int ii = 0; ii < vecsToPartition; ++ii) { - arrsize_t amount_ge_pivot = - partition_vec(arr + l_store, arr + l_store + unpartitioned, - vec_align[ii], pivot_vec, min_vec, max_vec, use_gt); - l_store += (vtype::numlanes - amount_ge_pivot); - unpartitioned -= vtype::numlanes; - } + // Partition the left and right vectors + SVE_UNROLL_APPLY(SVE_PARTITION_LEFT); + SVE_UNROLL_APPLY(SVE_PARTITION_RIGHT); + + // Partition the align_vec vectors + SVE_UNROLL_APPLY(SVE_PARTITION_TAIL); + +#undef SVE_LOAD_TAIL +#undef SVE_LOAD_LEFT +#undef SVE_LOAD_RIGHT +#undef SVE_PARTITION_LEFT +#undef SVE_PARTITION_RIGHT +#undef SVE_PARTITION_TAIL +#undef SVE_PARTITION_BATCH +#undef SVE_LOAD_BATCH_FROM_LEFT +#undef SVE_LOAD_BATCH_FROM_RIGHT +#undef SVE_PARTITION_ONE +#undef SVE_REG +#undef SVE_DECLARE_REG_SET +#undef SVE_DECLARE_REG_SET_UNINIT +#undef SVE_UNROLL_APPLY *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); + *biggest = vtype::reducemax(max_vec); return l_store; } -template -void sort_n(typename vtype::type_t *arr, int N); +template +SVE_SORT_INLINE arrsize_t sve_partition_select(type_t *arr, arrsize_t left, arrsize_t right, type_t pivot, + type_t *smallest, type_t *biggest, bool use_gt) { + if (vtype::partition_unroll_factor() == 4) { + return sve_partition_unrolled(arr, left, right, pivot, smallest, biggest, use_gt); + } else { + return sve_partition_unrolled(arr, left, right, pivot, smallest, biggest, use_gt); + } +} template -static void qsort_(type_t *arr, arrsize_t left, arrsize_t right, - arrsize_t max_iters) { - /* - * Resort to std::sort if quicksort isnt making any progress - */ - if (max_iters <= 0) { - std::sort(arr + left, arr + right + 1, comparison_func_ge); +SVE_SORT_INLINE void sve_qsort(type_t* arr, arrsize_t left, arrsize_t right, + arrsize_t max_iters) { + if ((right - left) <= OET_SORT_THRESHOLD) return; - } - /* - * Base case: use bitonic networks to sort arrays <= - * vtype::network_sort_threshold - */ - if (right + 1 - left <= vtype::network_sort_threshold) { - sort_n( - arr + left, (int32_t)(right + 1 - left)); + + if (max_iters <= 0) { + std::sort(arr + left, arr + right, sve_comparison_func_ge); return; } type_t pivot = get_pivot_blocks(arr, left, right); + type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - arrsize_t pivot_index = - partition_avx512_unrolled( - arr, left, right + 1, pivot, &smallest, &biggest, false); + arrsize_t 
pivot_index = sve_partition_select(arr, left, right, + pivot, &smallest, + &biggest, false); - if (pivot != smallest) - qsort_(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) qsort_(arr, pivot_index, right, max_iters - 1); + if (pivot != smallest) { + sve_qsort(arr, left, pivot_index, max_iters - 1); + } + if (pivot != biggest) { + sve_qsort(arr, pivot_index, right, max_iters - 1); + } } -// Hooks for OpenJDK sort -// to_index (exclusive) template -static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) { +SVE_SORT_INLINE int64_t sve_vect_partition(type_t* arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) { type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( - arr, from_index, to_index, pivot, &smallest, &biggest, use_gt); + int64_t pivot_index = sve_partition_select(arr, from_index, to_index, + pivot, &smallest, &biggest, use_gt); return pivot_index; } -// partitioning functions template -X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){ +SVE_SORT_INLINE void sve_dual_pivot_partition(T* arr, int64_t from_index, int64_t to_index, + int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){ const T pivot1 = arr[index_pivot1]; const T pivot2 = arr[index_pivot2]; @@ -431,12 +439,10 @@ X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, const int64_t start = low + 1; const int64_t end = high - 1; - std::swap(arr[index_pivot1], arr[low]); std::swap(arr[index_pivot2], arr[end]); - - const int64_t pivot_index2 = vectorized_partition(arr, start, end, pivot2, true); // use_gt = true + const int64_t pivot_index2 = sve_vect_partition(arr, start, end, pivot2, true); // use_gt = true std::swap(arr[end], arr[pivot_index2]); int64_t upper = pivot_index2; @@ -447,7 +453,7 @@ X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, return; } - const int64_t pivot_index1 = vectorized_partition(arr, start, upper, pivot1, false); // use_ge (use_gt = false) + const int64_t pivot_index1 = sve_vect_partition(arr, start, upper, pivot1, false); // use_ge (use_gt = false) int64_t lower = pivot_index1 - 1; std::swap(arr[low], arr[lower]); @@ -456,7 +462,8 @@ X86_SIMD_SORT_INLINE void simd_dual_pivot_partition(T *arr, int64_t from_index, } template -X86_SIMD_SORT_INLINE void simd_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot) { +SVE_SORT_INLINE void sve_single_pivot_partition(T* arr, int64_t from_index, int64_t to_index, + int32_t *pivot_indices, int64_t index_pivot) { const T pivot = arr[index_pivot]; const int64_t low = from_index; @@ -464,28 +471,18 @@ X86_SIMD_SORT_INLINE void simd_single_pivot_partition(T *arr, int64_t from_index const int64_t end = high - 1; - const int64_t pivot_index1 = vectorized_partition(arr, low, high, pivot, false); // use_gt = false (use_ge) + const int64_t pivot_index1 = sve_vect_partition(arr, low, high, pivot, false); // use_gt = false (use_ge) int64_t lower = pivot_index1; - const int64_t pivot_index2 = vectorized_partition(arr, pivot_index1, high, pivot, true); // use_gt = true + const int64_t pivot_index2 = sve_vect_partition(arr, pivot_index1, high, pivot, true); // use_gt = true int64_t upper = pivot_index2; pivot_indices[0] = lower; pivot_indices[1] = upper; } -template 
-X86_SIMD_SORT_INLINE void simd_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) { - if (index_pivot1 != index_pivot2) { - simd_dual_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); - } - else { - simd_single_pivot_partition(arr, from_index, to_index, pivot_indices, index_pivot1); - } -} - template -X86_SIMD_SORT_INLINE void insertion_sort(T *arr, int32_t from_index, int32_t to_index) { +SVE_SORT_INLINE void insertion_sort(T* arr, int32_t from_index, int32_t to_index) { for (int i, k = from_index; ++k < to_index; ) { T ai = arr[i = k]; if (ai < arr[i - 1]) { @@ -497,32 +494,25 @@ X86_SIMD_SORT_INLINE void insertion_sort(T *arr, int32_t from_index, int32_t to_ } } -template -X86_SIMD_SORT_INLINE void simd_fast_sort(T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) -{ +template +SVE_SORT_INLINE void sve_fast_sort(T* arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) { arrsize_t arrsize = to_index - from_index; + if (arrsize <= INS_SORT_THRESHOLD) { insertion_sort(arr, from_index, to_index); } else { - qsort_(arr, from_index, to_index - 1, 2 * (arrsize_t)log2(arrsize)); + sve_qsort, T>(arr, from_index, to_index, 2 * (arrsize_t) (63 - __builtin_clzll((unsigned long long) arrsize))); + sve_oet_sort, T>(arr, from_index, to_index); } } -#define DEFINE_METHODS(ISA, VTYPE) \ - template \ - X86_SIMD_SORT_INLINE void ISA##_fast_sort( \ - T *arr, arrsize_t from_index, arrsize_t to_index, const arrsize_t INS_SORT_THRESHOLD) \ - { \ - simd_fast_sort(arr, from_index, to_index, INS_SORT_THRESHOLD); \ - } \ - template \ - X86_SIMD_SORT_INLINE void ISA##_fast_partition( \ - T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) \ - { \ - simd_fast_partition(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); \ +template +SVE_SORT_INLINE void sve_fast_partition(T* arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) { + if (index_pivot1 != index_pivot2) { + sve_dual_pivot_partition, T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); } - -DEFINE_METHODS(avx2, avx2_vector) -DEFINE_METHODS(avx512, zmm_vector) - -#endif // XSS_COMMON_QSORT + else { + sve_single_pivot_partition, T>(arr, from_index, to_index, pivot_indices, index_pivot1); + } +} +#endif // AARCH64_SVE_COMMON_QSORT_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp new file mode 100644 index 0000000000000..86a7217ca71f1 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-config.hpp @@ -0,0 +1,55 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AARCH64_SVE_CONFIG_HPP +#define AARCH64_SVE_CONFIG_HPP + +#include +#include +#include +#include "simdsort-support.hpp" + +#define SIMD_SORT_INFINITYF std::numeric_limits::infinity() +#define SIMD_SORT_MAX_INT32 std::numeric_limits::max() +#define SIMD_SORT_MIN_INT32 std::numeric_limits::min() + +#if defined(__GNUC__) + #define SVE_SORT_INLINE static inline + #define SVE_SORT_FINLINE static inline __attribute__((always_inline)) +#else + #define SVE_SORT_INLINE static + #define SVE_SORT_FINLINE static +#endif + +#ifndef DLL_PUBLIC + #define DLL_PUBLIC __attribute__((visibility("default"))) +#endif + +using arrsize_t = std::size_t; + +#ifndef OET_SORT_THRESHOLD + #define OET_SORT_THRESHOLD 8 +#endif + +#endif // AARCH64_SVE_CONFIG_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp index 628d65077c701..9b6d3d2f52e4e 100644 --- a/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-linux-qsort.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2023 Intel Corporation. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,45 +23,41 @@ * */ -#include "simdsort-support.hpp" -#ifdef __SIMDSORT_SUPPORTED_LINUX - -#pragma GCC target("avx2") -#include "avx2-32bit-qsort.hpp" +#include "sve-config.hpp" +#include "sve-common-qsort.hpp" #include "classfile_constants.h" - - -#define DLL_PUBLIC __attribute__((visibility("default"))) -#define INSERTION_SORT_THRESHOLD_32BIT 16 +#include "simdsort-support.hpp" +#include extern "C" { - DLL_PUBLIC void avx2_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { + DLL_PUBLIC void sve_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) { switch(elem_type) { case JVM_T_INT: - avx2_fast_sort((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + sve_fast_sort((int32_t*)array, from_index, to_index, 64); break; case JVM_T_FLOAT: - avx2_fast_sort((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT); + sve_fast_sort((float*)array, from_index, to_index, 64); break; + case JVM_T_LONG: + case JVM_T_DOUBLE: default: assert(false, "Unexpected type"); } } - DLL_PUBLIC void avx2_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { + DLL_PUBLIC void sve_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) { switch(elem_type) { case JVM_T_INT: - avx2_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + sve_fast_partition((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; case JVM_T_FLOAT: - avx2_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); + 
sve_fast_partition((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2); break; + case JVM_T_LONG: + case JVM_T_DOUBLE: default: assert(false, "Unexpected type"); } } - } - -#endif \ No newline at end of file diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp new file mode 100644 index 0000000000000..61f24bc01a532 --- /dev/null +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-oet-sort.hpp @@ -0,0 +1,52 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef AARCH64_SVE_OET_SORT_HPP +#define AARCH64_SVE_OET_SORT_HPP + +#include "sve-config.hpp" +#include "sve-qsort.hpp" + +template +SVE_SORT_INLINE void sve_oet_sort(type_t* arr, arrsize_t from_index, arrsize_t to_index) { + arrsize_t arr_num = to_index - from_index; + const uint8_t numLanes = vtype::numlanes(); + + for (int32_t i = 0; i < OET_SORT_THRESHOLD; i++) { + // Odd-even pass: even i -> j starts at from_index + // odd i -> j starts at from_index + 1 + int32_t j = from_index + i % 2; + int32_t remaining = arr_num - (i % 2); + + while (remaining >= 2) { + const int32_t vals_per_iteration = (remaining < (2 * numLanes)) ? remaining : 2 * numLanes; + const int32_t num = vals_per_iteration / 2; + vtype::oet_sort(&arr[j], num); + + j += vals_per_iteration; + remaining -= vals_per_iteration; + } + } +} +#endif // AARCH64_SVE_OET_SORT_HPP diff --git a/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp b/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp index 9310b0098d808..24b5f1aaf8033 100644 --- a/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp +++ b/src/java.base/linux/native/libsimdsort/aarch64/sve-qsort.hpp @@ -1,6 +1,7 @@ /* * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. * Copyright (c) 2021 Serge Sans Paille. All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,345 +24,219 @@ * */ -// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) +#ifndef SVE_QSORT_VECTOR +#define SVE_QSORT_VECTOR -#ifndef AVX2_QSORT_32BIT -#define AVX2_QSORT_32BIT +#include +#include +#include -#include "avx2-emu-funcs.hpp" -#include "xss-common-qsort.h" - -/* - * Constants used in sorting 8 elements in a ymm registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ - -// ymm 7, 6, 5, 4, 3, 2, 1, 0 -#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4 - -/* - * Assumes ymm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template -X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm) { - const typename vtype::opmask_t oxAA = _mm256_set_epi32( - 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); - const typename vtype::opmask_t oxCC = _mm256_set_epi32( - 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0); - const typename vtype::opmask_t oxF0 = _mm256_set_epi32( - 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0); - - const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2); - ymm = cmp_merge( - ymm, vtype::template shuffle(ymm), oxAA); - ymm = cmp_merge( - ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm), oxCC); - ymm = cmp_merge( - ymm, vtype::template shuffle(ymm), oxAA); - ymm = cmp_merge(ymm, vtype::permutexvar(rev_index, ymm), oxF0); - ymm = cmp_merge( - ymm, vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm), oxCC); - ymm = cmp_merge( - ymm, vtype::template shuffle(ymm), oxAA); - return ymm; -} - -struct avx2_32bit_swizzle_ops; +template +struct sve_vector; template <> -struct avx2_vector { +struct sve_vector { using type_t = int32_t; - using reg_t = __m256i; - using ymmi_t = __m256i; - using opmask_t = __m256i; - static const uint8_t numlanes = 8; -#ifdef XSS_MINIMAL_NETWORK_SORT - static constexpr int network_sort_threshold = numlanes; -#else - static constexpr int network_sort_threshold = 256; -#endif - static constexpr int partition_unroll_factor = 4; - - using swizzle_ops = avx2_32bit_swizzle_ops; - - static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; } - static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; } - static reg_t zmm_max() { - return _mm256_set1_epi32(type_max()); - } // TODO: this should broadcast bits as is? - static opmask_t get_partial_loadmask(uint64_t num_to_read) { - auto mask = ((0x1ull << num_to_read) - 0x1ull); - return convert_int_to_avx2_mask(mask); + using reg_t = svint32_t; // SVE 32-bit integer vector + using opmask_t = svbool_t; // predicate register + /* TODO: Prefer avoiding a runtime svcntw() call when the vector length + * is known at compile time. One option is to add a template parameter to + * this struct for common cases - 128/256 bits with a fallback to svcntw() + * if the vector width is unknown at compile time. + */ + static inline uint8_t numlanes() { + return static_cast(svcntw()); } - static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + + static inline int partition_unroll_factor() { + return (svcntw() * sizeof(type_t)) > 16 ? 
4 : 2; } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) { - return _mm256_xor_si256(x, y); + + static type_t type_max() { return SIMD_SORT_MAX_INT32; } + static type_t type_min() { return SIMD_SORT_MIN_INT32; } + + static opmask_t knot_opmask(opmask_t x) { + return svnot_b_z(svptrue_b32(), x); } + static opmask_t ge(reg_t x, reg_t y) { - opmask_t equal = eq(x, y); - opmask_t greater = _mm256_cmpgt_epi32(x, y); - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(equal), - _mm256_castsi256_ps(greater))); + return svcmpge_s32(svptrue_b32(),x, y); } - static opmask_t gt(reg_t x, reg_t y) { return _mm256_cmpgt_epi32(x, y); } - static opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); } - template - static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, - void const *base) { - return _mm256_mask_i32gather_epi32(src, base, index, mask, scale); - } - template - static reg_t i64gather(__m256i index, void const *base) { - return _mm256_i32gather_epi32((int const *)base, index, scale); + + static opmask_t gt(reg_t x, reg_t y) { + return svcmpgt_s32(svptrue_b32(),x, y); } + static reg_t loadu(void const *mem) { - return _mm256_loadu_si256((reg_t const *)mem); - } - static reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { - return avx2_emu_mask_compressstoreu32(mem, mask, x); - } - static reg_t maskz_loadu(opmask_t mask, void const *mem) { - return _mm256_maskload_epi32((const int *)mem, mask); - } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { - reg_t dst = _mm256_maskload_epi32((type_t *)mem, mask); - return mask_mov(x, mask, dst); - } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x), - _mm256_castsi256_ps(y), - _mm256_castsi256_ps(mask))); - } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) { - return _mm256_maskstore_epi32((type_t *)mem, mask, x); - } - static reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); } - static reg_t permutexvar(__m256i idx, reg_t ymm) { - return _mm256_permutevar8x32_epi32(ymm, idx); - // return avx2_emu_permutexvar_epi32(idx, ymm); - } - static reg_t permutevar(reg_t ymm, __m256i idx) { - return _mm256_permutevar8x32_epi32(ymm, idx); - } - static reg_t reverse(reg_t ymm) { - const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); - return permutexvar(rev_index, ymm); + return svld1_s32(svptrue_b32(), (const int32_t*)mem); } + static type_t reducemax(reg_t v) { - return avx2_emu_reduce_max32(v); + return svmaxv_s32(svptrue_b32(), v); } + static type_t reducemin(reg_t v) { - return avx2_emu_reduce_min32(v); + return svminv_s32(svptrue_b32(), v); } - static reg_t set1(type_t v) { return _mm256_set1_epi32(v); } - template - static reg_t shuffle(reg_t ymm) { - return _mm256_shuffle_epi32(ymm, mask); + + static reg_t set1(type_t v) { + return svdup_n_s32(v); } + static void storeu(void *mem, reg_t x) { - _mm256_storeu_si256((__m256i *)mem, x); + return svst1_s32(svptrue_b32(), (int32_t*)mem, x); } - static reg_t sort_vec(reg_t x) { - return sort_ymm_32bit>(x); + + static reg_t min(reg_t x, reg_t y) { + return svmin_s32_z(svptrue_b32(), x, y); } - static reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) { return v; } + + static reg_t max(reg_t x, reg_t y) { + return svmax_s32_z(svptrue_b32(), x, y); + } + static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t 
reg) { - return avx2_double_compressstore32(left_addr, right_addr, k, - reg); + // fast path if all vector elements are less than pivot + svbool_t pg = svptrue_b32(); + if (!svptest_any(pg, k)) { + svst1_s32(pg, (int32_t*)left_addr, reg); + return 0; + } + + // fast path if all vector elements are greater than pivot + if (!svptest_any(pg, svnot_b_z(pg, k))) { + svst1_s32(pg, (int32_t*)right_addr, reg); + return numlanes(); + } + + uint64_t amount_ge_pivot = svcntp_b32(svptrue_b32(), k); + uint64_t amount_nge_pivot = numlanes() - amount_ge_pivot; + + svint32_t compressed_1 = svcompact_s32(knot_opmask(k), reg); + svint32_t compressed_2 = svcompact_s32(k, reg); + + svbool_t store_mask_1 = svwhilelt_b32_u64(0, amount_nge_pivot); + svbool_t store_mask_2 = svwhilelt_b32_u64(0, amount_ge_pivot); + + svst1_s32(store_mask_1, (int32_t*)left_addr, compressed_1); + svst1_s32(store_mask_2, (int32_t*)(right_addr + amount_nge_pivot), compressed_2); + + return amount_ge_pivot; + } + + static void oet_sort(type_t *arr, arrsize_t num) { + svbool_t p1 = svwhilelt_b32_u64(0, num); + const svint32x2_t z0_z1 = svld2_s32(p1, arr); + const svbool_t p2 = svcmplt_s32(p1, svget2_s32(z0_z1, 0), svget2_s32(z0_z1, 1)); + + const svint32_t z4 = svsel_s32(p2, svget2_s32(z0_z1, 0), svget2_s32(z0_z1, 1)); // z4 <- smaller values + const svint32_t z5 = svsel_s32(p2, svget2_s32(z0_z1, 1), svget2_s32(z0_z1, 0)); // z5 <- larger values + + svst2_s32(p1, arr, svcreate2_s32(z4, z5)); } }; template <> -struct avx2_vector { +struct sve_vector { using type_t = float; - using reg_t = __m256; - using ymmi_t = __m256i; - using opmask_t = __m256i; - static const uint8_t numlanes = 8; -#ifdef XSS_MINIMAL_NETWORK_SORT - static constexpr int network_sort_threshold = numlanes; -#else - static constexpr int network_sort_threshold = 256; -#endif - static constexpr int partition_unroll_factor = 4; - - using swizzle_ops = avx2_32bit_swizzle_ops; - - static type_t type_max() { return X86_SIMD_SORT_INFINITYF; } - static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; } - static reg_t zmm_max() { return _mm256_set1_ps(type_max()); } - - static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, - int v8) { - return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); + using reg_t = svfloat32_t; // SVE 32-bit float vector + using opmask_t = svbool_t; // predicate register + /* TODO: Prefer avoiding a runtime svcntw() call when the vector length + * is known at compile time. One option is to add a template parameter to + * this struct for common cases - 128/256 bits with a fallback to svcntw() + * if the vector width is unknown at compile time. + */ + static inline uint8_t numlanes() { + return static_cast(svcntw()); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) { - return _mm256_maskload_ps((const float *)mem, mask); + static inline int partition_unroll_factor() { + return (svcntw() * sizeof(type_t)) > 16 ? 
4 : 2; + } + + static type_t type_max() { return SIMD_SORT_INFINITYF; } + static type_t type_min() { return -SIMD_SORT_INFINITYF; } + + static opmask_t knot_opmask(opmask_t x) { + return svnot_b_z(svptrue_b32(), x); } + static opmask_t ge(reg_t x, reg_t y) { - return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); + return svcmpge_f32(svptrue_b32(),x, y); } + static opmask_t gt(reg_t x, reg_t y) { - return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); - } - static opmask_t eq(reg_t x, reg_t y) { - return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); - } - static opmask_t get_partial_loadmask(uint64_t num_to_read) { - auto mask = ((0x1ull << num_to_read) - 0x1ull); - return convert_int_to_avx2_mask(mask); - } - static int32_t convert_mask_to_int(opmask_t mask) { - return convert_avx2_mask_to_int(mask); - } - template - static opmask_t fpclass(reg_t x) { - if constexpr (type == (0x01 | 0x80)) { - return _mm256_castps_si256(_mm256_cmp_ps(x, x, _CMP_UNORD_Q)); - } else { - static_assert(type == (0x01 | 0x80), "should not reach here"); - } - } - template - static reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, - void const *base) { - return _mm256_mask_i32gather_ps(src, base, index, - _mm256_castsi256_ps(mask), scale); - ; - } - template - static reg_t i64gather(__m256i index, void const *base) { - return _mm256_i32gather_ps((float *)base, index, scale); + return svcmpgt_f32(svptrue_b32(),x, y); } + static reg_t loadu(void const *mem) { - return _mm256_loadu_ps((float const *)mem); - } - static reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { - return avx2_emu_mask_compressstoreu32(mem, mask, x); - } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { - reg_t dst = _mm256_maskload_ps((type_t *)mem, mask); - return mask_mov(x, mask, dst); - } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { - return _mm256_blendv_ps(x, y, _mm256_castsi256_ps(mask)); - } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) { - return _mm256_maskstore_ps((type_t *)mem, mask, x); - } - static reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); } - static reg_t permutexvar(__m256i idx, reg_t ymm) { - return _mm256_permutevar8x32_ps(ymm, idx); - } - static reg_t permutevar(reg_t ymm, __m256i idx) { - return _mm256_permutevar8x32_ps(ymm, idx); - } - static reg_t reverse(reg_t ymm) { - const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); - return permutexvar(rev_index, ymm); + return svld1_f32(svptrue_b32(), (const float*)mem); } + static type_t reducemax(reg_t v) { - return avx2_emu_reduce_max32(v); + return svmaxv_f32(svptrue_b32(), v); } + static type_t reducemin(reg_t v) { - return avx2_emu_reduce_min32(v); + return svminv_f32(svptrue_b32(), v); } - static reg_t set1(type_t v) { return _mm256_set1_ps(v); } - template - static reg_t shuffle(reg_t ymm) { - return _mm256_castsi256_ps( - _mm256_shuffle_epi32(_mm256_castps_si256(ymm), mask)); + + static reg_t set1(type_t v) { + return svdup_n_f32(v); } + static void storeu(void *mem, reg_t x) { - _mm256_storeu_ps((float *)mem, x); + return svst1_f32(svptrue_b32(), (float32_t*)mem, x); } - static reg_t sort_vec(reg_t x) { - return sort_ymm_32bit>(x); + + static reg_t min(reg_t x, reg_t y) { + return svmin_f32_z(svptrue_b32(), x, y); } - static reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); } - static __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); } + + static reg_t max(reg_t x, 
reg_t y) { + return svmax_f32_z(svptrue_b32(), x, y); + } + static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) { - return avx2_double_compressstore32(left_addr, right_addr, k, - reg); - } -}; + // fast path if all vector elements are less than pivot + svbool_t pg = svptrue_b32(); + if (!svptest_any(pg, k)) { + svst1_f32(pg, (float32_t*)left_addr, reg); + return 0; + } -struct avx2_32bit_swizzle_ops { - template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n( - typename vtype::reg_t reg) { - __m256i v = vtype::cast_to(reg); - - if constexpr (scale == 2) { - __m256 vf = _mm256_castsi256_ps(v); - vf = _mm256_permute_ps(vf, 0b10110001); - v = _mm256_castps_si256(vf); - } else if constexpr (scale == 4) { - __m256 vf = _mm256_castsi256_ps(v); - vf = _mm256_permute_ps(vf, 0b01001110); - v = _mm256_castps_si256(vf); - } else if constexpr (scale == 8) { - v = _mm256_permute2x128_si256(v, v, 0b00000001); - } else { - static_assert(scale == -1, "should not be reached"); + // fast path if all vector elements are greater than pivot + if (!svptest_any(pg, svnot_b_z(pg, k))) { + svst1_f32(pg, (float32_t*)right_addr, reg); + return numlanes(); } - return vtype::cast_from(v); - } + uint64_t amount_ge_pivot = svcntp_b32(svptrue_b32(), k); + uint64_t amount_nge_pivot = numlanes() - amount_ge_pivot; - template - X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n( - typename vtype::reg_t reg) { - __m256i v = vtype::cast_to(reg); - - if constexpr (scale == 2) { - return swap_n(reg); - } else if constexpr (scale == 4) { - constexpr uint64_t mask = 0b00011011; - __m256 vf = _mm256_castsi256_ps(v); - vf = _mm256_permute_ps(vf, mask); - v = _mm256_castps_si256(vf); - } else if constexpr (scale == 8) { - return vtype::reverse(reg); - } else { - static_assert(scale == -1, "should not be reached"); - } + svfloat32_t compressed_1 = svcompact_f32(knot_opmask(k), reg); + svfloat32_t compressed_2 = svcompact_f32(k, reg); - return vtype::cast_from(v); + svbool_t store_mask_1 = svwhilelt_b32_u64(0, amount_nge_pivot); + svbool_t store_mask_2 = svwhilelt_b32_u64(0, amount_ge_pivot); + + svst1_f32(store_mask_1, (float32_t*)left_addr, compressed_1); + svst1_f32(store_mask_2, (float32_t*)(right_addr + amount_nge_pivot), compressed_2); + + return amount_ge_pivot; } - template - X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n( - typename vtype::reg_t reg, typename vtype::reg_t other) { - __m256i v1 = vtype::cast_to(reg); - __m256i v2 = vtype::cast_to(other); - - if constexpr (scale == 2) { - v1 = _mm256_blend_epi32(v1, v2, 0b01010101); - } else if constexpr (scale == 4) { - v1 = _mm256_blend_epi32(v1, v2, 0b00110011); - } else if constexpr (scale == 8) { - v1 = _mm256_blend_epi32(v1, v2, 0b00001111); - } else { - static_assert(scale == -1, "should not be reached"); - } + static void oet_sort(type_t *arr, arrsize_t num) { + svbool_t p1 = svwhilelt_b32_u64(0, num); + const svfloat32x2_t z0_z1 = svld2_f32(p1, arr); + const svbool_t p2 = svcmplt_f32(p1, svget2_f32(z0_z1, 0), svget2_f32(z0_z1, 1)); - return vtype::cast_from(v1); + const svfloat32_t z4 = svsel_f32(p2, svget2_f32(z0_z1, 0), svget2_f32(z0_z1, 1)); // z4 <- smaller values + const svfloat32_t z5 = svsel_f32(p2, svget2_f32(z0_z1, 1), svget2_f32(z0_z1, 0)); // z5 <- larger values + + svst2_f32(p1, arr, svcreate2_f32(z4, z5)); } }; - -#endif // AVX2_QSORT_32BIT +#endif // SVE_QSORT_VECTOR
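
Note for reviewers: the per-vector partition primitive above (sve_vector<T>::double_compressstore) packs the lanes below the pivot to the left destination and the lanes at or above the pivot to the upper end of the right destination, and returns how many lanes went right. The scalar model below is an illustration only and is not part of the patch; the helper name and the sample values are made up for the sketch. It shows the semantics that the svcompact/svwhilelt/svst1 sequence implements for the use_gt == false (>=) case; the real code additionally takes the all-false/all-true fast paths so already-ordered blocks skip the compaction entirely.

    // Scalar sketch of one double_compressstore step (hypothetical helper,
    // not part of the patch). "lanes" models one SVE vector already loaded
    // from the array; the predicate corresponds to use_gt == false (>=).
    #include <cstdio>
    #include <vector>

    template <typename T>
    static int scalar_double_compressstore(T* left_addr, T* right_addr,
                                           const std::vector<T>& lanes, T pivot) {
        const int num_lanes = static_cast<int>(lanes.size());
        int amount_lt_pivot = 0;
        for (T v : lanes) {
            if (v < pivot) amount_lt_pivot++;        // lanes that stay on the left
        }
        int li = 0, ri = 0;
        for (T v : lanes) {
            if (v >= pivot) {
                // packed at the top of the right-hand slot, mirroring the
                // store to (right_addr + amount_nge_pivot) in the SVE code
                right_addr[amount_lt_pivot + ri++] = v;
            } else {
                left_addr[li++] = v;                 // compacted "lt" half
            }
        }
        return num_lanes - amount_lt_pivot;          // amount_ge_pivot
    }

    int main() {
        std::vector<int> lanes = {7, 2, 9, 4, 5, 1, 8, 3};  // one 256-bit vector of int32
        std::vector<int> left(lanes.size()), right(lanes.size());
        int ge = scalar_double_compressstore(left.data(), right.data(), lanes, 5);
        std::printf("lanes >= pivot: %d\n", ge);            // prints 4
        return 0;
    }

In the partition loops of sve-common-qsort.hpp this return value is what advances l_store and shrinks r_store, which is what lets the SVE version drop the l_store + unpartitioned bookkeeping used by the x86 implementation.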