|
1 | 1 | /* |
2 | 2 | * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved. |
3 | 3 | * Copyright (c) 2021 Serge Sans Paille. All rights reserved. |
| 4 | + * Copyright 2025 Arm Limited and/or its affiliates. |
4 | 5 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
5 | 6 | * |
6 | 7 | * This code is free software; you can redistribute it and/or modify it |
|
23 | 24 | * |
24 | 25 | */ |
25 | 26 |
|
26 | | -// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort) |
| 27 | +#ifndef AARCH64_SVE_PIVOT_SELECTION_HPP |
| 28 | +#define AARCH64_SVE_PIVOT_SELECTION_HPP |
27 | 29 |
|
28 | | -template <typename vtype, typename mm_t> |
29 | | -X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); |
| 30 | +#include <algorithm> |
| 31 | +#include "sve-config.hpp" |
30 | 32 |
|
31 | | -template <typename vtype, typename type_t> |
32 | | -X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left, |
33 | | - const arrsize_t right) { |
34 | | - using reg_t = typename vtype::reg_t; |
35 | | - type_t samples[vtype::numlanes]; |
36 | | - arrsize_t delta = (right - left) / vtype::numlanes; |
37 | | - for (int i = 0; i < vtype::numlanes; i++) { |
38 | | - samples[i] = arr[left + i * delta]; |
39 | | - } |
40 | | - reg_t rand_vec = vtype::loadu(samples); |
41 | | - reg_t sort = vtype::sort_vec(rand_vec); |
42 | | - |
43 | | - return ((type_t *)&sort)[vtype::numlanes / 2]; |
44 | | -} |
| 33 | +/* <TODO> The current pivot selection uses the median-of-three method. |
| 34 | + * A possible improvement is to use a sorting network (compare-and-exchange sorting) |
| 35 | + * for larger arrays. |
| 36 | + */ |
45 | 37 |
|
46 | 38 | template <typename vtype, typename type_t> |
47 | | -X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, const arrsize_t left, |
48 | | - const arrsize_t right) { |
49 | | - if (right - left <= 1024) { |
50 | | - return get_pivot<vtype>(arr, left, right); |
51 | | - } |
52 | | - |
53 | | - using reg_t = typename vtype::reg_t; |
54 | | - constexpr int numVecs = 5; |
55 | | - |
56 | | - arrsize_t width = (right - vtype::numlanes) - left; |
57 | | - arrsize_t delta = width / numVecs; |
| 39 | +static inline type_t get_pivot_blocks(type_t* arr, const arrsize_t left, const arrsize_t right) { |
| 40 | + const arrsize_t len = right - left; |
| 41 | + if (len < 64) return arr[left]; |
58 | 42 |
|
59 | | - reg_t vecs[numVecs]; |
60 | | - // Load data |
61 | | - for (int i = 0; i < numVecs; i++) { |
62 | | - vecs[i] = vtype::loadu(arr + left + delta * i); |
63 | | - } |
| 43 | + const arrsize_t mid = left + (len / 2); |
| 44 | + const type_t a = arr[left]; |
| 45 | + const type_t b = arr[mid]; |
| 46 | + const type_t c = arr[right - 1]; |
64 | 47 |
|
65 | | - // Implement sorting network (from |
66 | | - // https://bertdobbelaere.github.io/sorting_networks.html) |
67 | | - COEX<vtype>(vecs[0], vecs[3]); |
68 | | - COEX<vtype>(vecs[1], vecs[4]); |
| 48 | + const type_t min_ab = std::min(a, b); |
| 49 | + const type_t max_ab = std::max(a, b); |
69 | 50 |
|
70 | | - COEX<vtype>(vecs[0], vecs[2]); |
71 | | - COEX<vtype>(vecs[1], vecs[3]); |
72 | | - |
73 | | - COEX<vtype>(vecs[0], vecs[1]); |
74 | | - COEX<vtype>(vecs[2], vecs[4]); |
75 | | - |
76 | | - COEX<vtype>(vecs[1], vecs[2]); |
77 | | - COEX<vtype>(vecs[3], vecs[4]); |
78 | | - |
79 | | - COEX<vtype>(vecs[2], vecs[3]); |
80 | | - |
81 | | - // Calculate median of the middle vector |
82 | | - reg_t &vec = vecs[numVecs / 2]; |
83 | | - vec = vtype::sort_vec(vec); |
84 | | - |
85 | | - type_t data[vtype::numlanes]; |
86 | | - vtype::storeu(data, vec); |
87 | | - return data[vtype::numlanes / 2]; |
| 51 | + return std::min(max_ab, std::max(min_ab, c)); |
88 | 52 | } |
| 53 | + |
| 54 | +#endif // AARCH64_SVE_PIVOT_SELECTION_HPP |
0 commit comments