Skip to content

Commit 0c3ab11

Browse files
AArch64 SVE implementation for Arrays.sort
1 parent a14a3a0 commit 0c3ab11

File tree

14 files changed

+686
-694
lines changed

14 files changed

+686
-694
lines changed

make/modules/java.base/Lib.gmk

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
187187
TARGETS += $(BUILD_LIBFALLBACKLINKER)
188188
endif
189189

190+
SIMDSORT_BASE_DIR := $(TOPDIR)/src/java.base/linux/native/libsimdsort
190191
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
191192
##############################################################################
192193
## Build libsimdsort
@@ -196,6 +197,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
196197
NAME := simdsort, \
197198
LINK_TYPE := C++, \
198199
OPTIMIZATION := HIGH, \
200+
SRC := $(SIMDSORT_BASE_DIR)/x86, \
199201
CXXFLAGS := -std=c++17, \
200202
DISABLED_WARNINGS_gcc := unused-variable, \
201203
LIBS_linux := $(LIBM), \
@@ -204,4 +206,21 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
204206
TARGETS += $(BUILD_LIBSIMD_SORT)
205207
endif
206208

209+
##############################################################################
## Build libsimdsort (AArch64 SVE variant)
##
## Enabled only when targeting linux/aarch64 with C2 included and the gcc
## toolchain. Sources are taken from $(SIMDSORT_BASE_DIR)/aarch64 and are
## compiled with -march=armv8.2-a+sve (C++ additionally with -std=c++17).
##
## NOTE(review): the x86_64 libsimdsort block in this file selects the C++
## linker with "LINK_TYPE := C++", while this block uses
## "TOOLCHAIN := TOOLCHAIN_LINK_CXX" plus explicit *_JDKLIB flags and
## $(LIBCXX) -- confirm that both forms are accepted by SetupJdkLibrary.
##############################################################################
ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, aarch64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
210+
$(eval $(call SetupJdkLibrary, BUILD_LIBSIMD_SORT, \
211+
NAME := simdsort, \
212+
TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
213+
OPTIMIZATION := HIGH, \
214+
SRC := $(SIMDSORT_BASE_DIR)/aarch64, \
215+
CFLAGS := $(CFLAGS_JDKLIB) -march=armv8.2-a+sve, \
216+
CXXFLAGS := $(CXXFLAGS_JDKLIB) -march=armv8.2-a+sve -std=c++17, \
217+
LDFLAGS := $(LDFLAGS_JDKLIB) \
218+
$(call SET_SHARED_LIBRARY_ORIGIN), \
219+
LIBS := $(LIBCXX), \
220+
DISABLED_WARNINGS_gcc := unused-variable, \
221+
LIBS_linux := -lc -lm -ldl, \
222+
))
223+
224+
TARGETS += $(BUILD_LIBSIMD_SORT)
225+
endif
207226
################################################################################

src/hotspot/cpu/aarch64/globals_aarch64.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,9 @@ define_pd_global(intx, InlineSmallCode, 1000);
127127
"Branch Protection to use: none, standard, pac-ret") \
128128
product(bool, AlwaysMergeDMB, true, DIAGNOSTIC, \
129129
"Always merge DMB instructions in code emission") \
130-
130+
product(bool, UseSVELibSimdSortForFP, false, EXPERIMENTAL, \
131+
"Use SVE-based LibSimdSort for float type on SVE supporting " \
132+
"machines") \
131133
// end of ARCH_FLAGS
132134

133135
#endif // CPU_AARCH64_GLOBALS_AARCH64_HPP

src/hotspot/cpu/aarch64/matcher_aarch64.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,13 @@
197197

198198
// Is SIMD sort supported for this CPU?
// Returns true only when the CPU reports SVE support and the element type bt
// is T_INT, or T_FLOAT with UseSVELibSimdSortForFP enabled. Returns false for
// every other element type and on CPUs without SVE.
199199
static bool supports_simd_sort(BasicType bt) {
200+
// SIMD sort is supported only on machines with SVE.
201+
if (VM_Version::supports_sve()) {
202+
// Currently, only the T_INT and T_FLOAT types are supported.
203+
// However, T_FLOAT is supported only if the experimental
204+
// flag UseSVELibSimdSortForFP is enabled.
205+
return (bt == T_INT || (bt == T_FLOAT && UseSVELibSimdSortForFP));
206+
}
200207
return false;
201208
}
202209

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11873,6 +11873,10 @@ class StubGenerator: public StubCodeGenerator {
1187311873
StubRoutines::_montgomerySquare = g.generate_multiply();
1187411874
}
1187511875

11876+
// Load the simdsort library on SVE-capable hardware to enable the SIMD sort and partition intrinsics
11877+
if (VM_Version::supports_sve()) {
11878+
(void)StubRoutines::try_load_simdsort("sve_sort", "sve_partition");
11879+
}
1187611880
#endif // COMPILER2
1187711881

1187811882
if (UseChaCha20Intrinsics) {

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4303,22 +4303,10 @@ void StubGenerator::generate_compiler_stubs() {
43034303
// Load x86_64_sort library on supported hardware to enable SIMD sort and partition intrinsics
43044304

43054305
if (VM_Version::supports_avx512dq() || VM_Version::supports_avx2()) {
4306-
void *libsimdsort = nullptr;
4307-
char ebuf_[1024];
4308-
char dll_name_simd_sort[JVM_MAXPATHLEN];
4309-
if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
4310-
libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
4311-
}
4312-
// Get addresses for SIMD sort and partition routines
4313-
if (libsimdsort != nullptr) {
4314-
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
4315-
4316-
os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_sort" : "avx2_sort");
4317-
StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_);
4318-
4319-
os::snprintf_checked(ebuf_, sizeof(ebuf_), VM_Version::supports_avx512_simd_sort() ? "avx512_partition" : "avx2_partition");
4320-
StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
4321-
}
4306+
const bool use_avx512 = VM_Version::supports_avx512_simd_sort();
4307+
const char* sort_sym = use_avx512 ? "avx512_sort" : "avx2_sort";
4308+
const char* partition_sym = use_avx512 ? "avx512_partition" : "avx2_partition";
4309+
(void)StubRoutines::try_load_simdsort(sort_sym, partition_sym);
43224310
}
43234311

43244312
#endif // COMPILER2

src/hotspot/share/runtime/stubRoutines.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,38 @@ StubRoutines::select_arraycopy_function(BasicType t, bool aligned, bool disjoint
469469
#undef RETURN_STUB_PARM
470470
}
471471

472+
// Locates the "simdsort" shared library in the directory returned by
// Arguments::get_dll_dir(), loads it, and resolves the two entry points named
// by sort_sym and partition_sym.
//
// Returns true, after publishing both addresses in StubRoutines::_array_sort and
// StubRoutines::_array_partition, only when the library loads and both symbols
// resolve. In every other case it returns false and leaves both entries unchanged.
bool StubRoutines::try_load_simdsort(const char* sort_sym, const char* partition_sym) {
473+
void* libsimdsort = nullptr;
474+
// Scratch buffer handed to os::dll_load; presumably receives the loader's error
// text on failure. NOTE(review): its contents are never logged here.
char ebuf_[1024];
475+
char dll_name_simd_sort[JVM_MAXPATHLEN];
476+
477+
// Only attempt the load if a full path for the library could be built.
if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort),
478+
Arguments::get_dll_dir(), "simdsort")) {
479+
libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
480+
}
481+
482+
// Library not located or not loadable: report failure without logging.
if (libsimdsort == nullptr) {
483+
return false;
484+
}
485+
486+
// Get the addresses of the SIMD sort and partition routines.
487+
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT,
488+
JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
489+
address sort_addr = (address)os::dll_lookup(libsimdsort, sort_sym);
490+
address partition_addr = (address)os::dll_lookup(libsimdsort, partition_sym);
491+
492+
// Both routines are required; a partial match is treated as a failure.
if (sort_addr == nullptr || partition_addr == nullptr) {
493+
log_warning(library)("libsimdsort missing symbols: %s=" INTPTR_FORMAT ", %s=" INTPTR_FORMAT,
494+
sort_sym, p2i(sort_addr), partition_sym, p2i(partition_addr));
495+
// If either address is null, return false.
// NOTE(review): the library handle is not released on this path -- confirm
// that leaving it loaded is intended.
496+
return false;
497+
}
498+
499+
// Publish the entry points only after both lookups have succeeded.
StubRoutines::_array_sort = sort_addr;
500+
StubRoutines::_array_partition = partition_addr;
501+
return true;
502+
}
503+
472504
UnsafeMemoryAccessMark::UnsafeMemoryAccessMark(StubCodeGenerator* cgen, bool add_entry, bool continue_at_scope_end, address error_exit_pc) {
473505
_cgen = cgen;
474506
_ucm_entry = nullptr;

src/hotspot/share/runtime/stubRoutines.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "code/codeBlob.hpp"
2929
#include "memory/allocation.hpp"
3030
#include "prims/vectorSupport.hpp"
31+
#include "runtime/arguments.hpp"
3132
#include "runtime/frame.hpp"
3233
#include "runtime/mutexLocker.hpp"
3334
#include "runtime/stubCodeGenerator.hpp"
@@ -362,6 +363,9 @@ class StubRoutines: AllStatic {
362363
static void arrayof_oop_copy (HeapWord* src, HeapWord* dest, size_t count);
363364
static void arrayof_oop_copy_uninit(HeapWord* src, HeapWord* dest, size_t count);
364365

366+
// SIMD sort support. Tries to load the simdsort library and to resolve the two
367+
// entry points named by sort_sym and partition_sym. Only if both are found does it
368+
// set StubRoutines::_array_sort/_array_partition and return true; otherwise it
// returns false and leaves both entries unchanged.
static bool try_load_simdsort(const char* sort_sym, const char* partition_sym);
365369
};
366370

367371
#endif // SHARE_RUNTIME_STUBROUTINES_HPP
Lines changed: 21 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
33
* Copyright (c) 2021 Serge Sans Paille. All rights reserved.
4+
* Copyright 2025 Arm Limited and/or its affiliates.
45
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
56
*
67
* This code is free software; you can redistribute it and/or modify it
@@ -23,66 +24,31 @@
2324
*
2425
*/
2526

26-
// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
27+
#ifndef AARCH64_SVE_PIVOT_SELECTION_HPP
28+
#define AARCH64_SVE_PIVOT_SELECTION_HPP
2729

28-
template <typename vtype, typename mm_t>
29-
X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
30+
#include <algorithm>
31+
#include "sve-config.hpp"
3032

31-
template <typename vtype, typename type_t>
32-
X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left,
33-
const arrsize_t right) {
34-
using reg_t = typename vtype::reg_t;
35-
type_t samples[vtype::numlanes];
36-
arrsize_t delta = (right - left) / vtype::numlanes;
37-
for (int i = 0; i < vtype::numlanes; i++) {
38-
samples[i] = arr[left + i * delta];
39-
}
40-
reg_t rand_vec = vtype::loadu(samples);
41-
reg_t sort = vtype::sort_vec(rand_vec);
42-
43-
return ((type_t *)&sort)[vtype::numlanes / 2];
44-
}
33+
/* <TODO> The current pivot selection uses the median-of-three method.
34+
* A possible improvement is to use a sorting network (compare-and-exchange sorting)
35+
* for larger arrays.
36+
*/
4537

4638
// Picks a pivot value from arr for the range described by left and right.
// New (SVE) version: ranges shorter than 64 elements use arr[left] directly;
// otherwise the pivot is the median of arr[left], arr[left + len / 2] and
// arr[right - 1], where len = right - left.
// NOTE(review): the last sample is arr[right - 1], which suggests right is an
// exclusive bound here -- confirm against the callers.
template <typename vtype, typename type_t>
47-
X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left,
48-
const arrsize_t right) {
49-
if (right - left <= 1024) {
50-
return get_pivot<vtype>(arr, left, right);
51-
}
52-
53-
using reg_t = typename vtype::reg_t;
54-
constexpr int numVecs = 5;
55-
56-
arrsize_t width = (right - vtype::numlanes) - left;
57-
arrsize_t delta = width / numVecs;
39+
static inline type_t get_pivot_blocks(type_t* arr, const arrsize_t left, const arrsize_t right) {
40+
const arrsize_t len = right - left;
41+
// Short range: take the first element as the pivot without sampling.
if (len < 64) return arr[left];
5842

59-
reg_t vecs[numVecs];
60-
// Load data
61-
for (int i = 0; i < numVecs; i++) {
62-
vecs[i] = vtype::loadu(arr + left + delta * i);
63-
}
43+
// Three samples: first element, middle element and arr[right - 1].
const arrsize_t mid = left + (len / 2);
44+
const type_t a = arr[left];
45+
const type_t b = arr[mid];
46+
const type_t c = arr[right - 1];
6447

65-
// Implement sorting network (from
66-
// https://bertdobbelaere.github.io/sorting_networks.html)
67-
COEX<vtype>(vecs[0], vecs[3]);
68-
COEX<vtype>(vecs[1], vecs[4]);
48+
// Order the first two samples so that min_ab <= max_ab.
const type_t min_ab = std::min(a, b);
49+
const type_t max_ab = std::max(a, b);
6950

70-
COEX<vtype>(vecs[0], vecs[2]);
71-
COEX<vtype>(vecs[1], vecs[3]);
72-
73-
COEX<vtype>(vecs[0], vecs[1]);
74-
COEX<vtype>(vecs[2], vecs[4]);
75-
76-
COEX<vtype>(vecs[1], vecs[2]);
77-
COEX<vtype>(vecs[3], vecs[4]);
78-
79-
COEX<vtype>(vecs[2], vecs[3]);
80-
81-
// Calculate median of the middle vector
82-
reg_t &vec = vecs[numVecs / 2];
83-
vec = vtype::sort_vec(vec);
84-
85-
type_t data[vtype::numlanes];
86-
vtype::storeu(data, vec);
87-
return data[vtype::numlanes / 2];
51+
// Median of the three samples: c limited to the interval [min_ab, max_ab].
return std::min(max_ab, std::max(min_ab, c));
8852
}
53+
54+
#endif // AARCH64_SVE_PIVOT_SELECTION_HPP

src/java.base/linux/native/libsimdsort/aarch64/simdsort-support.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2023 Intel Corporation. All rights reserved.
3+
* Copyright 2025 Arm Limited and/or its affiliates.
34
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
45
*
56
* This code is free software; you can redistribute it and/or modify it
@@ -30,10 +31,10 @@
3031
#undef assert
3132
#define assert(cond, msg) { if (!(cond)) { fprintf(stderr, "assert fails %s %d: %s\n", __FILE__, __LINE__, msg); abort(); }}
3233

33-
34-
// GCC >= 9.1 is needed to build AVX2 portions of libsimdsort using C++17 features
35-
#if defined(_LP64) && (defined(__GNUC__) && ((__GNUC__ > 9) || ((__GNUC__ == 9) && (__GNUC_MINOR__ >= 1))))
34+
// GCC >= 10.1 is required for full support of the Arm SVE ACLE intrinsics (including the arm_sve.h header file)
35+
#if defined(__aarch64__) && defined(_LP64) && defined(__GNUC__) && \
36+
((__GNUC__ > 10) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 1))
3637
#define __SIMDSORT_SUPPORTED_LINUX
3738
#endif
3839

39-
#endif //SIMDSORT_SUPPORT_HPP
40+
#endif //SIMDSORT_SUPPORT_HPP

0 commit comments

Comments
 (0)