|
1 | 1 | #pragma once |
2 | 2 | #include "SIMD.hpp" |
3 | | -#include <iostream> |
4 | 3 | #include <cmath> |
| 4 | +#include <iostream> |
5 | 5 | #include <vector> |
6 | | - |
| 6 | +#ifdef USE_NUMA |
| 7 | +# include <numa.h> |
| 8 | +# include <numaif.h> |
| 9 | +# include <sched.h> // for sched_getcpu() |
| 10 | +#endif |
7 | 11 | #if defined(USE_KNL) |
8 | | -#include <hbwmalloc.h> |
| 12 | +# include <hbwmalloc.h> |
9 | 13 | #endif |
10 | 14 | /** |
11 | 15 | * @brief Aligned memory allocator for high-performance computing. |
12 | | - * |
13 | | - * This allocator provides memory aligned to a specified boundary, ensuring compatibility with SIMD instructions and optimal cache usage. |
14 | | - * On Intel KNL (Knights Landing) architectures, this allocator automatically uses the high-bandwidth MCDRAM via `hbw_posix_memalign` |
15 | | - * if the macro `USE_KNL` is defined. Otherwise, standard `posix_memalign` is used. |
16 | | - * |
17 | | - * This is particularly useful for vectorized numerical libraries, where memory alignment is essential for |
18 | | - * instruction-level parallelism (e.g., AVX, SSE, AVX-512). |
19 | | - * |
| 16 | + * |
| 17 | + * This allocator provides memory aligned to a specified boundary, ensuring compatibility with SIMD |
| 18 | + * instructions and optimal cache usage. On Intel KNL (Knights Landing) architectures, this |
| 19 | + * allocator automatically uses the high-bandwidth MCDRAM via `hbw_posix_memalign` if the macro |
| 20 | + * `USE_KNL` is defined. Otherwise, standard `posix_memalign` is used. |
| 21 | + * |
| 22 | + * This is particularly useful for vectorized numerical libraries, where memory alignment is |
| 23 | + * essential for instruction-level parallelism (e.g., AVX, SSE, AVX-512). |
| 24 | + * |
20 | 25 | * @tparam T Type of the objects being allocated. |
21 | | - * @tparam Alignment Memory alignment in bytes. Must be a power of two and compatible with the ISA in use (e.g., 32 for AVX256). |
| 26 | + * @tparam Alignment Memory alignment in bytes. Must be a power of two and compatible with the ISA |
| 27 | + * in use (e.g., 32 for AVX256). |
22 | 28 | */ |
23 | | -template <typename T, std::size_t Alignment> |
24 | | -struct AlignedAllocator { |
25 | | - using value_type = T; |
26 | | - using pointer = T*; |
27 | | - using const_pointer = const T*; |
28 | | - using reference = T&; |
29 | | - using const_reference = const T&; |
30 | | - using size_type = std::size_t; |
31 | | - using difference_type = std::ptrdiff_t; |
| 29 | +template <typename T, std::size_t Alignment> struct AlignedAllocator { |
| 30 | + using value_type = T; |
| 31 | + using pointer = T *; |
| 32 | + using const_pointer = const T *; |
| 33 | + using reference = T &; |
| 34 | + using const_reference = const T &; |
| 35 | + using size_type = std::size_t; |
| 36 | + using difference_type = std::ptrdiff_t; |
| 37 | + |
| 38 | + /** |
| 39 | + * @brief Rebinding structure for allocator traits. |
| 40 | + * |
| 41 | + * Allows conversion of an AlignedAllocator<T, Alignment> to AlignedAllocator<U, Alignment>, |
| 42 | + * which is required by STL containers during type conversions. |
| 43 | + * |
| 44 | + * @tparam U New type for rebind. |
| 45 | + */ |
| 46 | + template <typename U> struct rebind { |
| 47 | + using other = AlignedAllocator<U, Alignment>; |
| 48 | + }; |
| 49 | + /** |
| 50 | + * @brief Default constructor. |
| 51 | + * |
| 52 | + * Stateless and noexcept. |
| 53 | + */ |
| 54 | + AlignedAllocator() noexcept = default; |
| 55 | + template <typename U> |
| 56 | + /** |
| 57 | + * @brief Conversion constructor from another allocator of different type. |
| 58 | + * |
| 59 | + * Required by the STL allocator model. Does nothing as this allocator is stateless. |
| 60 | + * |
| 61 | + * @tparam U Other type. |
| 62 | + * @param other The other allocator (unnamed and unused). |
| 63 | + */ |
| 64 | + AlignedAllocator(const AlignedAllocator<U, Alignment> &) noexcept {} |
| 65 | + |
| 66 | + /** |
| 67 | + * @brief Allocates aligned memory for n elements of type T. |
| 68 | + * |
| 69 | + * The alignment is guaranteed to be at least `Alignment` bytes. On KNL platforms, |
| 70 | + * high-bandwidth memory (HBM) will be used via libhbw. |
| 71 | + * |
| 72 | + * When built with `USE_NUMA`, memory is requested on `node_id`, or on the calling CPU's node if `node_id < 0`. |
| 73 | + * |
| 74 | + * @param n Number of elements to allocate. |
| 75 | + * @param node_id NUMA node to allocate on (-1 = auto/local or no NUMA). |
| 76 | + * @return T* Pointer to aligned memory block. |
| 77 | + * |
| 78 | + * @throws std::bad_alloc If memory allocation fails. |
| 79 | + */ |
| 80 | + [[nodiscard]] T *allocate(std::size_t n, int node_id = -1) { |
| 81 | + void *ptr = nullptr; |
| 82 | + const std::size_t alloc_size = n * sizeof(T) + Alignment; |
32 | 83 |
|
33 | | - /** |
34 | | - * @brief Rebinding structure for allocator traits. |
35 | | - * |
36 | | - * Allows conversion of an AlignedAllocator<T, Alignment> to AlignedAllocator<U, Alignment>, |
37 | | - * which is required by STL containers during type conversions. |
38 | | - * |
39 | | - * @tparam U New type for rebind. |
40 | | - */ |
41 | | - template <typename U> |
42 | | - struct rebind { |
43 | | - using other = AlignedAllocator<U, Alignment>; |
44 | | - }; |
45 | | - /** |
46 | | - * @brief Conversion constructor from another allocator of different type. |
47 | | - * |
48 | | - * Required by the STL allocator model. Does nothing as this allocator is stateless. |
49 | | - * |
50 | | - * @tparam U Other type. |
51 | | - * @param other The other allocator. |
52 | | - */ |
53 | | - AlignedAllocator() noexcept = default; |
54 | | - template <typename U> |
55 | | - /** |
56 | | - * @brief Default constructor. |
57 | | - * |
58 | | - * Stateless and noexcept. |
59 | | - */ |
60 | | - AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {} |
61 | | - /** |
62 | | - * @brief Allocates aligned memory for n elements of type T. |
63 | | - * |
64 | | - * The alignment is guaranteed to be at least `Alignment` bytes. On KNL platforms, |
65 | | - * high-bandwidth memory (HBM) will be used via libhbw. |
66 | | - * |
67 | | - * @param n Number of elements to allocate. |
68 | | - * @return T* Pointer to aligned memory block. |
69 | | - * |
70 | | - * @throws std::bad_alloc If memory allocation fails. |
71 | | - */ |
72 | | - [[nodiscard]] T* allocate(std::size_t n) { |
73 | | - void* ptr = nullptr; |
74 | | - const std::size_t alloc_size = n * sizeof(T) + Alignment; |
| 84 | + static_assert((Alignment & (Alignment - 1)) == 0, "Alignment must be power of two."); |
| 85 | + static_assert(Alignment >= alignof(T), "Alignment must be >= alignof(T)."); |
75 | 86 |
|
76 | 87 | #if defined(USE_KNL) |
77 | | - if (hbw_posix_memalign(&ptr, alignment, alloc_size) != 0) |
78 | | - throw std::bad_alloc(); |
| 88 | + if (hbw_posix_memalign(&ptr, Alignment, alloc_size) != 0) |
| 89 | + throw std::bad_alloc(); |
| 90 | + |
| 91 | +#elif defined(USE_NUMA) |
| 92 | + if (numa_available() >= 0) { |
| 93 | + int target_node = node_id; |
| 94 | + if (target_node < 0) { |
| 95 | + int cpu = sched_getcpu(); |
| 96 | + target_node = numa_node_of_cpu(cpu); |
| 97 | + } |
| 98 | + |
| 99 | + long node_mem = numa_node_size64(target_node, nullptr); |
| 100 | + if (node_mem <= 0) { |
| 101 | + for (int n = 0; n <= numa_max_node(); ++n) { |
| 102 | + if (numa_node_size64(n, nullptr) > 0) { |
| 103 | + target_node = n; |
| 104 | + break; |
| 105 | + } |
| 106 | + } |
| 107 | + // std::cerr << "[NUMA WARN] Node " << target_node |
| 108 | + // << " has no memory, fallback to node " << target_node << "\n"; |
| 109 | + } |
| 110 | + |
| 111 | + ptr = numa_alloc_onnode(alloc_size, target_node); |
| 112 | + if (!ptr) { |
| 113 | + std::cerr << "[NUMA WARN] numa_alloc_onnode failed, fallback to posix_memalign()\n"; |
| 114 | + if (posix_memalign(&ptr, Alignment, alloc_size) != 0) |
| 115 | + throw std::bad_alloc(); |
| 116 | + } |
| 117 | + } else if (posix_memalign(&ptr, Alignment, alloc_size) != 0) { |
| 118 | + throw std::bad_alloc(); |
| 119 | + } |
| 120 | + |
79 | 121 | #else |
80 | | - if (posix_memalign(&ptr, Alignment, alloc_size) != 0) |
81 | | - throw std::bad_alloc(); |
| 122 | + if (posix_memalign(&ptr, Alignment, alloc_size) != 0) |
| 123 | + throw std::bad_alloc(); |
82 | 124 | #endif |
83 | | - return reinterpret_cast<T*>(ptr); |
84 | | - } |
85 | | - /** |
86 | | - * @brief Deallocates memory previously allocated with allocate(). |
87 | | - * |
88 | | - * On KNL platforms, this calls `hbw_free`. Otherwise, standard `free` is used. |
89 | | - * |
90 | | - * @param p Pointer to memory to deallocate. |
91 | | - * @param size Number of elements (not used). |
92 | | - */ |
93 | | - void deallocate(T* p, std::size_t) noexcept { |
| 125 | + return reinterpret_cast<T *>(ptr); |
| 126 | + } |
| 127 | + |
| 128 | + /** |
| 129 | + * @brief Deallocates memory previously allocated with allocate(). |
| 130 | + * |
| 131 | + * On KNL platforms, this calls `hbw_free`. On NUMA systems, it uses `numa_free()`. Otherwise, |
| 132 | + * standard `free()` is used. |
| 133 | + * |
| 134 | + * @param p Pointer to memory to deallocate. |
| 135 | + * @param n Number of elements; only used to compute the `numa_free()` size (`n * sizeof(T)`). |
| 136 | + */ |
| 137 | + void deallocate(T *p, std::size_t n) noexcept { |
94 | 138 | #if defined(USE_KNL) |
95 | | - hbw_free(p); |
| 139 | + hbw_free(p); |
| 140 | +#elif defined(USE_NUMA) |
| 141 | + if (numa_available() >= 0) |
| 142 | + numa_free(p, n * sizeof(T)); |
| 143 | + else |
| 144 | + free(p); |
96 | 145 | #else |
97 | | - free(p); |
| 146 | + free(p); |
98 | 147 | #endif |
99 | | - } |
| 148 | + } |
100 | 149 | }; |
| 150 | + |
101 | 151 | /** |
102 | 152 | * @brief Type alias for a std::vector with aligned memory allocation. |
103 | | - * |
| 153 | + * |
104 | 154 | * This provides an aligned vector container compatible with SIMD usage. |
105 | 155 | * The alignment used is determined by the macro `ALIGN`, typically set |
106 | 156 | * based on the SIMD instruction set width (e.g., 16 for SSE, 32 for AVX, 64 for AVX-512). |
107 | | - * |
| 157 | + * |
108 | 158 | * @tparam K Type of the elements. |
109 | 159 | */ |
110 | | -template<typename K> |
111 | | -using aligned_vector = std::vector<K, AlignedAllocator<K, ALIGN>>; |
| 160 | +template <typename K> using aligned_vector = std::vector<K, AlignedAllocator<K, ALIGN>>; |
112 | 161 |
|
113 | 162 | /** |
114 | 163 | * @brief Equality operator for AlignedAllocator. |
115 | | - * |
| 164 | + * |
116 | 165 | * Always returns true as the allocator is stateless and does not manage |
117 | 166 | * any per-instance resources. |
118 | | - * |
| 167 | + * |
119 | 168 | * @tparam T Type of allocated elements. |
120 | 169 | * @tparam Alignment Alignment in bytes. |
121 | | - * |
| 170 | + * |
122 | 171 | * @return true |
123 | 172 | */ |
124 | 173 | template <typename T, std::size_t Alignment> |
125 | | -bool operator==(const AlignedAllocator<T, Alignment>&, const AlignedAllocator<T, Alignment>&) noexcept { |
| 174 | +bool operator==(const AlignedAllocator<T, Alignment> &, |
| 175 | + const AlignedAllocator<T, Alignment> &) noexcept { |
126 | 176 | return true; |
127 | 177 | } |
128 | 178 | /** |
129 | 179 | * @brief Inequality operator for AlignedAllocator. |
130 | | - * |
| 180 | + * |
131 | 181 | * Always returns false, as there are no distinguishing stateful properties. |
132 | | - * |
| 182 | + * |
133 | 183 | * @tparam T Type of allocated elements. |
134 | 184 | * @tparam Alignment Alignment in bytes. |
135 | | - * |
| 185 | + * |
136 | 186 | * @return false |
137 | 187 | */ |
138 | 188 | template <typename T, std::size_t Alignment> |
139 | | -bool operator!=(const AlignedAllocator<T, Alignment>&, const AlignedAllocator<T, Alignment>&) noexcept { |
| 189 | +bool operator!=(const AlignedAllocator<T, Alignment> &, |
| 190 | + const AlignedAllocator<T, Alignment> &) noexcept { |
140 | 191 | return false; |
141 | 192 | } |
142 | | - |
|
0 commit comments