Skip to content

Commit 513b1f1

Browse files
AArch64 SVE implementation for Arrays.sort
This patch adds an SVE implementation of primitive array sorting (Arrays.sort()) on AArch64 systems that support SVE. On non-SVE machines, we fall back to the existing Java implementation. For smaller arrays (length <= 64), we use insertion sort; for larger arrays, we use an SVE-vectorized quicksort partitioner followed by an odd-even transposition cleanup pass. The SVE path is enabled by default for the int type. For the float type, it is available through the experimental flag UseSVELibSimdSortForFP, enabled with: -XX:+UnlockExperimentalVMOptions -XX:+UseSVELibSimdSortForFP. Without this flag, the default Java implementation is used for floats (the flag is disabled by default). Float is gated because of observed regressions on some small/medium sizes. On larger arrays, the SVE float path shows up to a 1.47x speedup on Neoverse V2 and 2.12x on Neoverse V1. The following are the performance numbers for the ArraysSort JMH benchmark. Case A: ratio between the scores of the master branch and this patch with the UseSVELibSimdSortForFP flag disabled (which is the default). Case B: ratio between the scores of the master branch and this patch with the UseSVELibSimdSortForFP flag enabled (the int numbers are expected to be the same, but this also enables SVE-vectorized sorting for floats). We want the ratios to be >= 1, i.e. on par with or better than the default Java implementation (master branch). 
On Neoverse V1: Benchmark (size) Mode Cnt A B ArraysSort.floatParallelSort 10 avgt 3 0.98 0.98 ArraysSort.floatParallelSort 25 avgt 3 1.01 0.83 ArraysSort.floatParallelSort 50 avgt 3 0.99 0.55 ArraysSort.floatParallelSort 75 avgt 3 0.99 0.66 ArraysSort.floatParallelSort 100 avgt 3 0.98 0.66 ArraysSort.floatParallelSort 1000 avgt 3 1.00 0.84 ArraysSort.floatParallelSort 10000 avgt 3 1.03 1.52 ArraysSort.floatParallelSort 100000 avgt 3 1.03 1.46 ArraysSort.floatParallelSort 1000000 avgt 3 0.98 1.81 ArraysSort.floatSort 10 avgt 3 1.00 0.98 ArraysSort.floatSort 25 avgt 3 1.00 0.81 ArraysSort.floatSort 50 avgt 3 0.99 0.56 ArraysSort.floatSort 75 avgt 3 0.99 0.65 ArraysSort.floatSort 100 avgt 3 0.98 0.70 ArraysSort.floatSort 1000 avgt 3 0.99 0.84 ArraysSort.floatSort 10000 avgt 3 0.99 1.72 ArraysSort.floatSort 100000 avgt 3 1.00 1.94 ArraysSort.floatSort 1000000 avgt 3 1.00 2.13 ArraysSort.intParallelSort 10 avgt 3 1.08 1.08 ArraysSort.intParallelSort 25 avgt 3 1.04 1.05 ArraysSort.intParallelSort 50 avgt 3 1.29 1.30 ArraysSort.intParallelSort 75 avgt 3 1.16 1.16 ArraysSort.intParallelSort 100 avgt 3 1.07 1.07 ArraysSort.intParallelSort 1000 avgt 3 1.13 1.13 ArraysSort.intParallelSort 10000 avgt 3 1.49 1.38 ArraysSort.intParallelSort 100000 avgt 3 1.64 1.62 ArraysSort.intParallelSort 1000000 avgt 3 2.26 2.27 ArraysSort.intSort 10 avgt 3 1.08 1.08 ArraysSort.intSort 25 avgt 3 1.02 1.02 ArraysSort.intSort 50 avgt 3 1.25 1.25 ArraysSort.intSort 75 avgt 3 1.16 1.20 ArraysSort.intSort 100 avgt 3 1.07 1.07 ArraysSort.intSort 1000 avgt 3 1.12 1.13 ArraysSort.intSort 10000 avgt 3 1.94 1.95 ArraysSort.intSort 100000 avgt 3 1.86 1.86 ArraysSort.intSort 1000000 avgt 3 2.09 2.09 On Neoverse V2: Benchmark (size) Mode Cnt A B ArraysSort.floatParallelSort 10 avgt 3 1.02 1.02 ArraysSort.floatParallelSort 25 avgt 3 0.97 0.71 ArraysSort.floatParallelSort 50 avgt 3 0.94 0.65 ArraysSort.floatParallelSort 75 avgt 3 0.96 0.82 ArraysSort.floatParallelSort 100 avgt 3 0.95 0.84 
ArraysSort.floatParallelSort 1000 avgt 3 1.01 0.94 ArraysSort.floatParallelSort 10000 avgt 3 1.01 1.25 ArraysSort.floatParallelSort 100000 avgt 3 1.01 1.09 ArraysSort.floatParallelSort 1000000 avgt 3 1.00 1.10 ArraysSort.floatSort 10 avgt 3 1.02 1.00 ArraysSort.floatSort 25 avgt 3 0.99 0.76 ArraysSort.floatSort 50 avgt 3 0.97 0.66 ArraysSort.floatSort 75 avgt 3 1.01 0.83 ArraysSort.floatSort 100 avgt 3 1.00 0.85 ArraysSort.floatSort 1000 avgt 3 0.99 0.93 ArraysSort.floatSort 10000 avgt 3 1.00 1.28 ArraysSort.floatSort 100000 avgt 3 1.00 1.37 ArraysSort.floatSort 1000000 avgt 3 1.00 1.48 ArraysSort.intParallelSort 10 avgt 3 1.05 1.05 ArraysSort.intParallelSort 25 avgt 3 0.99 0.84 ArraysSort.intParallelSort 50 avgt 3 1.03 1.14 ArraysSort.intParallelSort 75 avgt 3 0.91 0.99 ArraysSort.intParallelSort 100 avgt 3 0.98 0.96 ArraysSort.intParallelSort 1000 avgt 3 1.32 1.30 ArraysSort.intParallelSort 10000 avgt 3 1.40 1.40 ArraysSort.intParallelSort 100000 avgt 3 1.00 1.04 ArraysSort.intParallelSort 1000000 avgt 3 1.15 1.14 ArraysSort.intSort 10 avgt 3 1.05 1.05 ArraysSort.intSort 25 avgt 3 1.03 1.03 ArraysSort.intSort 50 avgt 3 1.08 1.14 ArraysSort.intSort 75 avgt 3 0.88 0.98 ArraysSort.intSort 100 avgt 3 1.01 0.99 ArraysSort.intSort 1000 avgt 3 1.3 1.32 ArraysSort.intSort 10000 avgt 3 1.43 1.43 ArraysSort.intSort 100000 avgt 3 1.30 1.30 ArraysSort.intSort 1000000 avgt 3 1.37 1.37
1 parent 901445f commit 513b1f1

File tree

14 files changed

+686
-694
lines changed

14 files changed

+686
-694
lines changed

make/modules/java.base/Lib.gmk

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
187187
TARGETS += $(BUILD_LIBFALLBACKLINKER)
188188
endif
189189

190+
SIMDSORT_BASE_DIR := $(TOPDIR)/src/java.base/linux/native/libsimdsort
190191
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
191192
##############################################################################
192193
## Build libsimdsort
@@ -196,6 +197,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
196197
NAME := simdsort, \
197198
LINK_TYPE := C++, \
198199
OPTIMIZATION := HIGH, \
200+
SRC := $(SIMDSORT_BASE_DIR)/x86, \
199201
CXXFLAGS := -std=c++17, \
200202
DISABLED_WARNINGS_gcc := unused-variable, \
201203
LIBS_linux := $(LIBM), \
@@ -204,4 +206,21 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
204206
TARGETS += $(BUILD_LIBSIMD_SORT)
205207
endif
206208

209+
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
210+
$(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \
211+
NAME := simdsort, \
212+
TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
213+
OPTIMIZATION := HIGH, \
214+
SRC := $(SIMDSORT_BASE_DIR)/aarch64, \
215+
CFLAGS := $(CFLAGS_JDKLIB) -march=armv8.2-a+sve, \
216+
CXXFLAGS := $(CXXFLAGS_JDKLIB) -march=armv8.2-a+sve -std=c++17, \
217+
LDFLAGS := $(LDFLAGS_JDKLIB) \
218+
$(call SET_SHARED_LIBRARY_ORIGIN), \
219+
LIBS := $(LIBCXX), \
220+
DISABLED_WARNINGS_gcc := unused-variable, \
221+
LIBS_linux := -lc -lm -ldl, \
222+
))
223+
224+
TARGETS += $(BUILD_LIBSIMD_SORT)
225+
endif
207226
################################################################################

src/hotspot/cpu/aarch64/globals_aarch64.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ define_pd_global(intx, InlineSmallCode, 1000);
127127
"Branch Protection to use: none, standard, pac-ret") \
128128
product(bool, AlwaysMergeDMB, true, DIAGNOSTIC, \
129129
"Always merge DMB instructions in code emission") \
130-
130+
product(bool, UseSVELibSimdSortForFP, false, EXPERIMENTAL, \
131+
"Use SVE-based LibSimdSort for float type on SVE supporting " \
132+
"machines") \
131133
// end of ARCH_FLAGS
132134

133135
#endif // CPU_AARCH64_GLOBALS_AARCH64_HPP

src/hotspot/cpu/aarch64/matcher_aarch64.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,13 @@
197197

198198
// Is SIMD sort supported for this CPU?
199199
// Reports whether the SIMD sort intrinsic may be used for element type bt.
// Returns true only when the CPU supports SVE and bt is T_INT, or bt is
// T_FLOAT with the experimental flag UseSVELibSimdSortForFP enabled.
static bool supports_simd_sort(BasicType bt) {
200+
// SIMD sort is supported only on SVE machines.
201+
if (VM_Version::supports_sve()) {
202+
// Currently, only the T_INT and T_FLOAT types are supported.
203+
// However, T_FLOAT is supported only if the experimental
204+
// flag UseSVELibSimdSortForFP is enabled.
205+
return (bt == T_INT || (bt == T_FLOAT && UseSVELibSimdSortForFP));
206+
}
200207
// No SVE support: SIMD sort is not available for any type.
return false;
201208
}
202209

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11873,6 +11873,10 @@ class StubGenerator: public StubCodeGenerator {
1187311873
StubRoutines::_montgomerySquare = g.generate_multiply();
1187411874
}
1187511875

11876+
// Load the simdsort library on SVE-capable hardware to enable the SIMD sort and partition intrinsics
11877+
if (VM_Version::supports_sve()) {
11878+
(void)StubRoutines::try_load_simdsort("sve_sort", "sve_partition");
11879+
}
1187611880
#endif // COMPILER2
1187711881

1187811882
if (UseChaCha20Intrinsics) {

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4303,22 +4303,10 @@ void StubGenerator::generate_compiler_stubs() {
43034303
// Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics
43044304

43054305
if (VM_Version::supports_avx512dq() || VM_Version::supports_avx2()) {
4306-
void *libsimdsort = nullptr;
4307-
char ebuf_[1024];
4308-
char dll_name_simd_sort[JVM_MAXPATHLEN];
4309-
if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
4310-
libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
4311-
}
4312-
// Get addresses for SIMD sort and partition routines
4313-
if (libsimdsort != nullptr) {
4314-
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
4315-
4316-
os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_sort" : "avx2_sort");
4317-
StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_);
4318-
4319-
os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_partition" : "avx2_partition");
4320-
StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
4321-
}
4306+
const bool use_avx512 = VM_Version::supports_avx512_simd_sort();
4307+
const char* sort_sym = use_avx512 ? "avx512_sort" : "avx2_sort";
4308+
const char* partition_sym = use_avx512 ? "avx512_partition" : "avx2_partition";
4309+
(void)StubRoutines::try_load_simdsort(sort_sym, partition_sym);
43224310
}
43234311

43244312
#endif // COMPILER2

src/hotspot/share/runtime/stubRoutines.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,38 @@ StubRoutines::select_arraycopy_function(BasicType t, bool aligned, bool disjoint
469469
#undef RETURN_STUB_PARM
470470
}
471471

472+
// Locates the "simdsort" library in Arguments::get_dll_dir(), loads it, and
// resolves the two entry points named by sort_sym and partition_sym.
//
// sort_sym       name of the sort routine to look up in the library
// partition_sym  name of the partition routine to look up in the library
//
// Returns true and publishes both addresses in StubRoutines::_array_sort and
// StubRoutines::_array_partition only if the library loads and BOTH symbols
// resolve. Otherwise returns false and leaves both stubs unmodified.
bool StubRoutines::try_load_simdsort(const char* sort_sym, const char* partition_sym) {
473+
void* libsimdsort = nullptr;
474+
char ebuf_[1024];
475+
char dll_name_simd_sort[JVM_MAXPATHLEN];
476+
477+
// Only attempt the load if a library path could be built for "simdsort".
if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort),
478+
Arguments::get_dll_dir(), "simdsort")) {
479+
libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
480+
}
481+
482+
// Library not found or failed to load: nothing to resolve.
if (libsimdsort == nullptr) {
483+
return false;
484+
}
485+
486+
// Get the addresses of the SIMD sort and partition routines.
487+
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT,
488+
JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
489+
address sort_addr = (address)os::dll_lookup(libsimdsort, sort_sym);
490+
address partition_addr = (address)os::dll_lookup(libsimdsort, partition_sym);
491+
492+
if (sort_addr == nullptr || partition_addr == nullptr) {
493+
log_warning(library)("libsimdsort missing symbols: %s=" INTPTR_FORMAT ", %s=" INTPTR_FORMAT,
494+
sort_sym, p2i(sort_addr), partition_sym, p2i(partition_addr));
495+
// If either address is null, return false without installing either stub.
// NOTE(review): the library handle is not released on this path, so the
// library stays loaded — confirm this is intentional.
496+
return false;
497+
}
498+
499+
// Both symbols resolved: install them together.
StubRoutines::_array_sort = sort_addr;
500+
StubRoutines::_array_partition = partition_addr;
501+
return true;
502+
}
503+
472504
UnsafeMemoryAccessMark::UnsafeMemoryAccessMark(StubCodeGenerator* cgen, bool add_entry, bool continue_at_scope_end, address error_exit_pc) {
473505
_cgen = cgen;
474506
_ucm_entry = nullptr;

src/hotspot/share/runtime/stubRoutines.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "code/codeBlob.hpp"
2929
#include "memory/allocation.hpp"
3030
#include "prims/vectorSupport.hpp"
31+
#include "runtime/arguments.hpp"
3132
#include "runtime/frame.hpp"
3233
#include "runtime/mutexLocker.hpp"
3334
#include "runtime/stubCodeGenerator.hpp"
@@ -362,6 +363,9 @@ class StubRoutines: AllStatic {
362363
static void arrayof_oop_copy (HeapWord* src, HeapWord* dest, size_t count);
363364
static void arrayof_oop_copy_uninit(HeapWord* src, HeapWord* dest, size_t count);
364365

366+
// SIMD sort support. Loads libsimdsort and resolves the symbols sort_sym and partition_sym.
367+
// On success, sets StubRoutines::_array_sort/_array_partition and returns true; otherwise returns false.
368+
static bool try_load_simdsort(const char* sort_sym, const char* partition_sym);
365369
};
366370

367371
#endif // SHARE_RUNTIME_STUBROUTINES_HPP
Lines changed: 21 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
33
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
4+
* Copyright 2025 Arm Limited and/or its affiliates.
45
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
56
*
67
* This code is free software; you can redistribute it and/or modify it
@@ -23,66 +24,31 @@
2324
*
2425
*/
2526

26-
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
27+
#ifndef AARCH64_SVE_PIVOT_SELECTION_HPP
28+
#define AARCH64_SVE_PIVOT_SELECTION_HPP
2729

28-
template <typename vtype, typename mm_t>
29-
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
30+
#include <algorithm>
31+
#include "sve-config.hpp"
3032

31-
template <typename vtype, typename type_t>
32-
X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left,
33-
const arrsize_t right) {
34-
using reg_t = typename vtype::reg_t;
35-
type_t samples[vtype::numlanes];
36-
arrsize_t delta = (right - left) / vtype::numlanes;
37-
for (int i = 0; i < vtype::numlanes; i++) {
38-
samples[i] = arr[left + i * delta];
39-
}
40-
reg_t rand_vec = vtype::loadu(samples);
41-
reg_t sort = vtype::sort_vec(rand_vec);
42-
43-
return ((type_t *)&sort)[vtype::numlanes / 2];
44-
}
33+
/* <TODO> The current pivot selection uses the median-of-three method.
34+
* A possible improvement is to use a sorting network (compare-and-exchange sorting)
35+
* for larger arrays.
36+
*/
4537

4638
template <typename vtype, typename type_t>
47-
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left,
48-
const arrsize_t right) {
49-
if (right - left <= 1024) {
50-
return get_pivot<vtype>(arr, left, right);
51-
}
52-
53-
using reg_t = typename vtype::reg_t;
54-
constexpr int numVecs = 5;
55-
56-
arrsize_t width = (right - vtype::numlanes) - left;
57-
arrsize_t delta = width / numVecs;
39+
static inline type_t get_pivot_blocks(type_t* arr, const arrsize_t left, const arrsize_t right) {
40+
const arrsize_t len = right - left;
41+
if (len < 64) return arr[left];
5842

59-
reg_t vecs[numVecs];
60-
// Load data
61-
for (int i = 0; i < numVecs; i++) {
62-
vecs[i] = vtype::loadu(arr + left + delta * i);
63-
}
43+
const arrsize_t mid = left + (len / 2);
44+
const type_t a = arr[left];
45+
const type_t b = arr[mid];
46+
const type_t c = arr[right - 1];
6447

65-
// Implement sorting network (from
66-
// https://bertdobbelaere.github.io/sorting_networks.html)
67-
COEX<vtype>(vecs[0], vecs[3]);
68-
COEX<vtype>(vecs[1], vecs[4]);
48+
const type_t min_ab = std::min(a, b);
49+
const type_t max_ab = std::max(a, b);
6950

70-
COEX<vtype>(vecs[0], vecs[2]);
71-
COEX<vtype>(vecs[1], vecs[3]);
72-
73-
COEX<vtype>(vecs[0], vecs[1]);
74-
COEX<vtype>(vecs[2], vecs[4]);
75-
76-
COEX<vtype>(vecs[1], vecs[2]);
77-
COEX<vtype>(vecs[3], vecs[4]);
78-
79-
COEX<vtype>(vecs[2], vecs[3]);
80-
81-
// Calculate median of the middle vector
82-
reg_t &vec = vecs[numVecs / 2];
83-
vec = vtype::sort_vec(vec);
84-
85-
type_t data[vtype::numlanes];
86-
vtype::storeu(data, vec);
87-
return data[vtype::numlanes / 2];
51+
return std::min(max_ab, std::max(min_ab, c));
8852
}
53+
54+
#endif // AARCH64_SVE_PIVOT_SELECTION_HPP

src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2023 Intel Corporation. All rights reserved.
3+
* Copyright 2025 Arm Limited and/or its affiliates.
34
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
45
*
56
* This code is free software; you can redistribute it and/or modify it
@@ -30,10 +31,10 @@
3031
#undef assert
3132
#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }}
3233

33-
34-
// GCC >= 9.1 is needed to build AVX2 portions of libsimdsort using C++17 features
35-
#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 9) || ((__GNUC__ == 9) && (__GNUC_MINOR__ >= 1))))
34+
// GCC >= 10.1 is required for full support of the Arm SVE ACLE intrinsics (including the arm_sve.h header)
35+
#if defined(__aarch64__) && defined(_LP64) && defined(__GNUC__) && \
36+
((__GNUC__ > 10) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 1))
3637
#define __SIMDSORT_SUPPORTED_LINUX
3738
#endif
3839

39-
#endif //SIMDSORT_SUPPORT_HPP
40+
#endif //SIMDSORT_SUPPORT_HPP

0 commit comments

Comments
 (0)