Skip to content

Commit fcad3f5

Browse files
committed
Modified allocator to ensure numa node allocations
1 parent 1a039f1 commit fcad3f5

File tree

7 files changed

+169
-95
lines changed

7 files changed

+169
-95
lines changed

CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ else()
2929
add_compile_definitions(TENSORIUM_FALLBACK)
3030
endif()
3131

32+
find_library(NUMA_LIB numa)
33+
if (NUMA_LIB)
34+
message(STATUS "→ libnuma detected: enabling NUMA-aware allocation")
35+
add_compile_definitions(USE_NUMA)
36+
set(HAVE_NUMA TRUE)
37+
else()
38+
message(WARNING "libnuma not found: NUMA support disabled")
39+
set(HAVE_NUMA FALSE)
40+
endif()
41+
3242
find_package(CUDAToolkit QUIET)
3343

3444
execute_process(
@@ -121,6 +131,10 @@ else()
121131
endif()
122132
endif()
123133

134+
if (HAVE_NUMA)
135+
message(STATUS "→ Linking libnuma to all targets (NUMA-aware allocation enabled)")
136+
link_libraries(${NUMA_LIB})
137+
endif()
124138
if(BUILD_PLUGINS)
125139
add_subdirectory(Plugins)
126140
endif()

Tools/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
# tensorium-fdump: function-analysis tool built from the FunctionAnalyser sources.
#
# The sources live in the FunctionAnalyser/ directory next to this file. Paths are
# anchored to CMAKE_CURRENT_LIST_DIR so they resolve the same way whether this file is
# pulled in with add_subdirectory() or include(), and on case-sensitive filesystems.
add_executable(tensorium-fdump
    ${CMAKE_CURRENT_LIST_DIR}/FunctionAnalyser/main.cpp
    ${CMAKE_CURRENT_LIST_DIR}/FunctionAnalyser/Analyzer.cpp
    ${CMAKE_CURRENT_LIST_DIR}/FunctionAnalyser/Register.cpp
)

target_link_libraries(tensorium-fdump PRIVATE tensorium_core LLVM)

Tools/FunctionAnalyser/Analyzer.cpp

Whitespace-only changes.

Tools/FunctionAnalyser/Register.cpp

Whitespace-only changes.

Tools/FunctionAnalyser/main.cpp

Whitespace-only changes.

includes/Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <cstdlib>
77
#include <cstring>
88
#include <omp.h>
9+
#include <cstdint>
910
/*
1011
* This GEMM kernel is based on Aman Salykov's version, with improvements to the OMP scheduling and block
1112
* sizes
@@ -826,7 +827,8 @@ template <typename T> class GemmKernelBigger {
826827
pack_panelA(&A[i], &blockA_packed[i * kc], mr, kc, M);
827828
}
828829
}
829-
void matmul(T *A, T *B, T *C, int M, int N, int K) {
830+
void matmul(T *A, T *B, T *C, int M, int N, int K) {
831+
__asm volatile("# LLVM-MCA-BEGIN foo":::"memory");
830832
# pragma omp parallel
831833
{
832834
int tid = omp_get_thread_num();

includes/Tensorium/SIMD/Allocator.hpp

Lines changed: 144 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,142 +1,192 @@
11
#pragma once
22
#include "SIMD.hpp"
3-
#include <iostream>
43
#include <cmath>
4+
#include <iostream>
55
#include <vector>
6-
6+
#ifdef USE_NUMA
7+
# include <numa.h>
8+
# include <numaif.h>
9+
# include <sched.h> // for sched_getcpu()
10+
#endif
711
#if defined(USE_KNL)
8-
#include <hbwmalloc.h>
12+
# include <hbwmalloc.h>
913
#endif
1014
/**
1115
* @brief Aligned memory allocator for high-performance computing.
12-
*
13-
* This allocator provides memory aligned to a specified boundary, ensuring compatibility with SIMD instructions and optimal cache usage.
14-
* On Intel KNL (Knights Landing) architectures, this allocator automatically uses the high-bandwidth MCDRAM via `hbw_posix_memalign`
15-
* if the macro `USE_KNL` is defined. Otherwise, standard `posix_memalign` is used.
16-
*
17-
* This is particularly useful for vectorized numerical libraries, where memory alignment is essential for
18-
* instruction-level parallelism (e.g., AVX, SSE, AVX-512).
19-
*
16+
*
17+
* This allocator provides memory aligned to a specified boundary, ensuring compatibility with SIMD
18+
* instructions and optimal cache usage. On Intel KNL (Knights Landing) architectures, this
19+
* allocator automatically uses the high-bandwidth MCDRAM via `hbw_posix_memalign` if the macro
20+
* `USE_KNL` is defined. Otherwise, standard `posix_memalign` is used.
21+
*
22+
* This is particularly useful for vectorized numerical libraries, where memory alignment is
23+
* essential for instruction-level parallelism (e.g., AVX, SSE, AVX-512).
24+
*
2025
* @tparam T Type of the objects being allocated.
21-
* @tparam Alignment Memory alignment in bytes. Must be a power of two and compatible with the ISA in use (e.g., 32 for AVX256).
26+
* @tparam Alignment Memory alignment in bytes. Must be a power of two and compatible with the ISA
27+
* in use (e.g., 32 for AVX256).
2228
*/
23-
template <typename T, std::size_t Alignment> struct AlignedAllocator {
    using value_type = T;
    using pointer = T *;
    using const_pointer = const T *;
    using reference = T &;
    using const_reference = const T &;
    using size_type = std::size_t;
    using difference_type = std::ptrdiff_t;

    /**
     * @brief Rebinding structure for allocator traits.
     *
     * Allows conversion of an AlignedAllocator<T, Alignment> to AlignedAllocator<U, Alignment>,
     * which is required by STL containers during type conversions.
     *
     * @tparam U New type for rebind.
     */
    template <typename U> struct rebind {
        using other = AlignedAllocator<U, Alignment>;
    };

    /**
     * @brief Default constructor.
     *
     * Stateless and noexcept.
     */
    AlignedAllocator() noexcept = default;

    /**
     * @brief Conversion constructor from another allocator of different type.
     *
     * Required by the STL allocator model. Does nothing as this allocator is stateless.
     *
     * @tparam U Other type.
     */
    template <typename U>
    AlignedAllocator(const AlignedAllocator<U, Alignment> &) noexcept {}

    /**
     * @brief Allocates aligned memory for n elements of type T.
     *
     * Backend selection is done at compile time:
     *  - `USE_KNL`:  `hbw_posix_memalign` (high-bandwidth memory via libhbw).
     *  - `USE_NUMA`: `numa_alloc_onnode` when libnuma reports NUMA as available at runtime,
     *                `posix_memalign` otherwise.
     *  - otherwise:  `posix_memalign`.
     *
     * Every request is padded by `Alignment` extra bytes.
     *
     * In the NUMA backend a failed `numa_alloc_onnode` is reported as `std::bad_alloc`
     * rather than silently switching to `posix_memalign`: deallocate() cannot tell the two
     * kinds of pointer apart, and passing a malloc-family pointer to `numa_free` is invalid.
     *
     * @param n       Number of elements to allocate.
     * @param node_id NUMA node to allocate on. A negative value selects the node of the CPU
     *                the calling thread is currently running on. Ignored by the non-NUMA
     *                backends.
     * @return T* Pointer to the allocated block.
     *
     * @throws std::bad_alloc If the byte count overflows or the allocation fails.
     */
    [[nodiscard]] T *allocate(std::size_t n, [[maybe_unused]] int node_id = -1) {
        static_assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0,
                      "Alignment must be power of two.");
        static_assert(Alignment >= alignof(T), "Alignment must be >= alignof(T).");

        void *ptr = nullptr;
        const std::size_t alloc_size = padded_bytes(n);

#if defined(USE_KNL)
        if (hbw_posix_memalign(&ptr, Alignment, alloc_size) != 0)
            throw std::bad_alloc();

#elif defined(USE_NUMA)
        if (numa_usable()) {
            int target_node = node_id;
            if (target_node < 0)
                target_node = numa_node_of_cpu(sched_getcpu());

            // The preferred node may be invalid or memory-less; in that case take the first
            // node that actually reports memory.
            if (numa_node_size64(target_node, nullptr) <= 0) {
                const int max_node = numa_max_node();
                for (int node = 0; node <= max_node; ++node) {
                    if (numa_node_size64(node, nullptr) > 0) {
                        target_node = node;
                        break;
                    }
                }
            }

            // NOTE(review): numa_alloc_onnode is assumed to return page-aligned memory, so
            // this relies on Alignment not exceeding the system page size - confirm.
            ptr = numa_alloc_onnode(alloc_size, target_node);
            if (!ptr)
                throw std::bad_alloc();
        } else if (posix_memalign(&ptr, Alignment, alloc_size) != 0) {
            throw std::bad_alloc();
        }

#else
        if (posix_memalign(&ptr, Alignment, alloc_size) != 0)
            throw std::bad_alloc();
#endif
        return static_cast<T *>(ptr);
    }

    /**
     * @brief Deallocates memory previously allocated with allocate().
     *
     * On KNL platforms, this calls `hbw_free`. In the NUMA backend it calls `numa_free()` when
     * libnuma is usable (the same condition allocate() uses) and `free()` otherwise. In all
     * other builds standard `free()` is used.
     *
     * @param p Pointer to memory to deallocate. A null pointer is ignored.
     * @param n Number of elements that was passed to the matching allocate() call.
     */
    void deallocate(T *p, [[maybe_unused]] std::size_t n) noexcept {
        if (p == nullptr)
            return;
#if defined(USE_KNL)
        hbw_free(p);
#elif defined(USE_NUMA)
        if (numa_usable()) {
            // Must mirror the size requested in allocate(), padding included. No overflow
            // check is needed: allocate() already validated this n.
            numa_free(p, n * sizeof(T) + Alignment);
        } else {
            free(p);
        }
#else
        free(p);
#endif
    }

  private:
    /**
     * @brief Number of bytes requested from the backend for n elements.
     *
     * Equals `n * sizeof(T) + Alignment`.
     * NOTE(review): the extra `Alignment` bytes are presumably slack for full-width SIMD
     * accesses past the last element - confirm against the kernels.
     *
     * @throws std::bad_alloc If the computation would overflow std::size_t.
     */
    static std::size_t padded_bytes(std::size_t n) {
        const std::size_t max_bytes = static_cast<std::size_t>(-1);
        if (n > (max_bytes - Alignment) / sizeof(T))
            throw std::bad_alloc();
        return n * sizeof(T) + Alignment;
    }

#if defined(USE_NUMA)
    /**
     * @brief Whether libnuma can be used in this process.
     *
     * The answer is computed once and cached so that allocate() and deallocate() always
     * agree on which backend owns a block.
     */
    static bool numa_usable() noexcept {
        static const bool usable = numa_available() >= 0;
        return usable;
    }
#endif
};
150+
101151
/**
102152
* @brief Type alias for a std::vector with aligned memory allocation.
103-
*
153+
*
104154
* This provides an aligned vector container compatible with SIMD usage.
105155
* The alignment used is determined by the macro `ALIGN`, typically set
106156
* based on the SIMD instruction set width (e.g., 16 for SSE, 32 for AVX, 64 for AVX-512).
107-
*
157+
*
108158
* @tparam K Type of the elements.
109159
*/
110-
template<typename K>
111-
using aligned_vector = std::vector<K, AlignedAllocator<K, ALIGN>>;
160+
template <typename K> using aligned_vector = std::vector<K, AlignedAllocator<K, ALIGN>>;
112161

113162
/**
114163
* @brief Equality operator for AlignedAllocator.
115-
*
164+
*
116165
* Always returns true as the allocator is stateless and does not manage
117166
* any per-instance resources.
118-
*
167+
*
119168
* @tparam T Type of allocated elements.
120169
* @tparam Alignment Alignment in bytes.
121-
*
170+
*
122171
* @return true
123172
*/
124173
template <typename T, std::size_t Alignment>
125-
bool operator==(const AlignedAllocator<T, Alignment>&, const AlignedAllocator<T, Alignment>&) noexcept {
174+
bool operator==(const AlignedAllocator<T, Alignment> &,
175+
const AlignedAllocator<T, Alignment> &) noexcept {
126176
return true;
127177
}
128178
/**
129179
* @brief Inequality operator for AlignedAllocator.
130-
*
180+
*
131181
* Always returns false, as there are no distinguishing stateful properties.
132-
*
182+
*
133183
* @tparam T Type of allocated elements.
134184
* @tparam Alignment Alignment in bytes.
135-
*
185+
*
136186
* @return false
137187
*/
138188
template <typename T, std::size_t Alignment>
139-
bool operator!=(const AlignedAllocator<T, Alignment>&, const AlignedAllocator<T, Alignment>&) noexcept {
189+
bool operator!=(const AlignedAllocator<T, Alignment> &,
190+
const AlignedAllocator<T, Alignment> &) noexcept {
140191
return false;
141192
}
142-

0 commit comments

Comments
 (0)