Skip to content

Commit 9f722a0

Browse files
committed
Replace CPP content with aarch64 branch content
1 parent dfd9258 commit 9f722a0

File tree

14 files changed

+1591
-1083
lines changed

14 files changed

+1591
-1083
lines changed

CMakeLists.txt

Lines changed: 81 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,92 @@
1+
cmake_minimum_required(VERSION 3.22)
2+
project(Tensorium LANGUAGES C CXX)
13

2-
cmake_minimum_required(VERSION 3.12)
3-
project(tensorium_bindings LANGUAGES CXX)
4+
# ─────────────────────────────── Options ───────────────────────────────
5+
option(USE_MPI "Enable MPI support" OFF)
6+
option(USE_KNL "Tune for Intel KNL" OFF)
7+
option(AVX512 "Enable AVX-512" OFF)
8+
option(VERBOSE "Verbose runtime logs" OFF)
9+
option(DEBUG "Build with debug symbols" OFF)
10+
option(BUILD_PLUGINS "Build Clang/LLVM plugins" ON)
11+
option(BUILD_TESTS "Build test executables" ON)
12+
option(BUILD_PYBIND "Build Python bindings" OFF)
413

514
set(CMAKE_CXX_STANDARD 17)
615
set(CMAKE_CXX_STANDARD_REQUIRED ON)
7-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
16+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
817

9-
add_compile_options(-O3 -march=native -mavx2 -mfma -Wignored-attributes)
10-
add_compile_options(-Wno-ignored-attributes)
11-
find_package(pybind11 REQUIRED)
18+
message(STATUS "Detecting architecture...")
1219

13-
find_package(OpenMP REQUIRED)
20+
# ─────────────────────────────── Architecture detection ───────────────────────────────
21+
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
22+
message(STATUS "→ Building for x86_64 (AVX/AVX2/AVX512 enabled)")
23+
add_compile_definitions(TENSORIUM_X86)
24+
add_compile_options(-mavx2 -mfma)
25+
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
26+
message(STATUS "→ Building for ARM64 / Apple Silicon (NEON enabled)")
27+
add_compile_definitions(TENSORIUM_ARM)
28+
else()
29+
message(WARNING "→ Unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}, using scalar fallback.")
30+
add_compile_definitions(TENSORIUM_FALLBACK)
31+
endif()
1432

15-
include_directories(${CMAKE_SOURCE_DIR}/includes)
33+
include_directories(${CMAKE_SOURCE_DIR}/Includes)
1634

17-
pybind11_add_module(tensorium ./Pybind/bindings.cpp)
35+
# ─────────────────────────────── CPU optimization flags ───────────────────────────────
36+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64")
37+
message(STATUS "Configuring for x86_64: enabling AVX2/FMA or AVX512")
38+
set(BASE_FLAGS "-O3 -mtune=native -Wno-ignored-attributes -Rpass-analysis=tensorium-align")
39+
set(AVX2_FLAGS "-mfma -mavx2")
40+
set(AVX512_FLAGS "-mfma -mavx512f -mavx512cd")
41+
if(AVX512)
42+
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX512_FLAGS}")
43+
else()
44+
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX2_FLAGS}")
45+
endif()
46+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
47+
message(STATUS "Configuring for Apple Silicon ARM64: disabling AVX flags")
48+
set(CMAKE_CXX_FLAGS "-O3 -mcpu=apple-m1 -Wno-ignored-attributes")
49+
else()
50+
message(WARNING "Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}); using generic optimization flags.")
51+
set(CMAKE_CXX_FLAGS "-O3 -mtune=native")
52+
endif()
1853

19-
target_link_libraries(tensorium PRIVATE OpenMP::OpenMP_CXX)
20-
# install(TARGETS tensorium
21-
# LIBRARY DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
54+
if(DEBUG)
55+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
56+
endif()
57+
if(VERBOSE)
58+
add_compile_definitions(VERBOSE)
59+
endif()
2260

61+
message(STATUS "Detected architecture: ${CMAKE_SYSTEM_PROCESSOR}")
2362

63+
# ─────────────────────────────── OpenMP handling ───────────────────────────────
64+
if(APPLE)
65+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
66+
message(STATUS "Configuring OpenMP manually for macOS ARM64 (Apple Clang)")
67+
set(OPENMP_INCLUDE_PATH "/opt/homebrew/opt/libomp/include")
68+
set(OPENMP_LIB_PATH "/opt/homebrew/opt/libomp/lib")
69+
set(OPENMP_LIB "omp")
70+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
71+
message(STATUS "Using OpenMP for macOS x86_64 (Intel)")
72+
find_package(OpenMP)
73+
if(OpenMP_CXX_FOUND)
74+
set(OPENMP_LIB "OpenMP::OpenMP_CXX")
75+
endif()
76+
endif()
77+
else()
78+
find_package(OpenMP)
79+
if(OpenMP_CXX_FOUND)
80+
set(OPENMP_LIB "OpenMP::OpenMP_CXX")
81+
endif()
82+
endif()
83+
84+
# ─────────────────────────────── Plugins ───────────────────────────────
85+
if(BUILD_PLUGINS)
86+
add_subdirectory(Plugins)
87+
endif()
88+
89+
# ─────────────────────────────── Tests ───────────────────────────────
90+
if(BUILD_TESTS)
91+
add_subdirectory(Tests)
92+
endif()

Plugins/CMakeLists.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
find_package(LLVM REQUIRED CONFIG)
2+
include_directories(${LLVM_INCLUDE_DIRS})
3+
add_definitions(${LLVM_DEFINITIONS})
4+
5+
# List of components to map to libraries
6+
set(LLVM_COMPONENTS
7+
Core
8+
Support
9+
IRReader
10+
Analysis
11+
Passes
12+
TransformUtils
13+
)
14+
15+
llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_COMPONENTS})
16+
17+
# Tensorium Dispatch Plugin (Clang AST)
18+
add_library(TensoriumDispatchPlugin SHARED TensoriumDispatchPlugin.cpp)
19+
target_link_libraries(TensoriumDispatchPlugin PRIVATE clang-cpp ${LLVM_LIBS})
20+
21+
# Tensorium LLVM IR Plugin
22+
add_library(TensoriumLLVM_IRCheck SHARED TensoriumLLVM_IRCheck.cpp)
23+
target_link_libraries(TensoriumLLVM_IRCheck PRIVATE ${LLVM_LIBS})

Plugins/TensoriumDispatchPlugin.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@
66
#include "clang/Lex/Preprocessor.h"
77
#include "clang/Tooling/Tooling.h"
88
#include "llvm/Support/MemoryBuffer.h"
9+
#include <iostream>
10+
//
11+
// static int _ = []() {
12+
// std::cerr << "[TensoriumDispatchPlugin] Plugin loaded into Clang.\n";
13+
// return 0;
14+
// }();
915
// #include "LLVM_Handler.hpp"
1016
/**
1117
* @file TensoriumPlugin.cpp
@@ -59,7 +65,8 @@ class TensoriumASTConsumer : public ASTConsumer {
5965
// SourceLocation loc = FD->getBeginLoc();
6066
// for (const auto &entry : TensoriumTargetTable) {
6167
// if (Context.getSourceManager().isBeforeInTranslationUnit(entry.loc,
62-
// loc)) { std::string fname = FD->getNameAsString(); llvm::errs() << "[tensorium] Target("
68+
// loc)) { std::string fname = FD->getNameAsString();
69+
// llvm::errs() << "[tensorium] Target("
6370
// << entry.platform << ", " << entry.isa
6471
// << ") applies to function " << fname << "\n";
6572
//
@@ -300,6 +307,16 @@ class TensoriumPluginAction : public PluginASTAction {
300307

301308
} // namespace
302309

303-
/// @brief Register the plugin under the name "tensorium-dispatch"
304-
static FrontendPluginRegistry::Add<TensoriumPluginAction> X("tensorium-dispatch",
305-
"Handle #pragma tensorium directives");
310+
// @brief Register the plugin under the name "tensorium-dispatch"
311+
// Important: we avoid double-free issues by wrapping in a ManagedStatic,
312+
313+
#include "clang/Frontend/FrontendPluginRegistry.h"
314+
315+
//
316+
317+
using clang::FrontendPluginRegistry;
318+
319+
static FrontendPluginRegistry::Add<TensoriumPluginAction>
320+
X("tensorium-dispatch", "Handle #pragma tensorium directives");
321+
322+

Pybind/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
find_package(pybind11 REQUIRED)
3+
pybind11_add_module(tensorium_bindings bindings.cpp)
4+
target_include_directories(tensorium_bindings PRIVATE ${CMAKE_SOURCE_DIR}/Includes)

Pybind/bindings.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#include "Tensorium/Tensorium.hpp"
1+
#include "../includes/Tensorium/Tensorium.hpp"
22
#include <iostream>
3-
#include <omp.h>
3+
// #include <omp.h>
44
#include <pybind11/numpy.h>
55
#include <pybind11/pybind11.h>
66
#include <pybind11/stl.h>

Tests/CMakeLists.txt

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
file(GLOB TEST_SOURCES
2+
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
3+
${CMAKE_CURRENT_SOURCE_DIR}/Matrix/*.cpp
4+
${CMAKE_CURRENT_SOURCE_DIR}/Vector/*.cpp
5+
${CMAKE_CURRENT_SOURCE_DIR}/Derivatives/*.cpp
6+
${CMAKE_CURRENT_SOURCE_DIR}/LinearSystems/*.cpp
7+
${CMAKE_CURRENT_SOURCE_DIR}/Tensor/*.cpp
8+
)
9+
10+
add_executable(TensoriumTests ${TEST_SOURCES})
11+
12+
if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
13+
include_directories(${OPENMP_INCLUDE_PATH})
14+
link_directories(${OPENMP_LIB_PATH})
15+
target_compile_options(TensoriumTests PRIVATE -Xpreprocessor -fopenmp)
16+
target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
17+
elseif(DEFINED OPENMP_LIB)
18+
target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
19+
endif()
20+
21+
if(USE_MPI)
22+
find_package(MPI REQUIRED)
23+
target_link_libraries(TensoriumTests PRIVATE MPI::MPI_CXX)
24+
target_compile_definitions(TensoriumTests PRIVATE MORPHEUS_USE_MPI)
25+
endif()
26+
27+
if(USE_KNL)
28+
target_compile_options(TensoriumTests PRIVATE -mtune=knl -mfma -mavx512f -mavx512cd)
29+
target_compile_definitions(TensoriumTests PRIVATE USE_KNL)
30+
target_link_libraries(TensoriumTests PRIVATE memkind)
31+
endif()
32+
# set(DISPATCH_PLUGIN_PATH "${CMAKE_BINARY_DIR}/Plugins/libTensoriumDispatchPlugin.so")
33+
#
34+
# add_dependencies(TensoriumTests TensoriumDispatchPlugin)
35+
#
36+
# # inject the clang plugin (frontend)
37+
# set_property(TARGET TensoriumTests APPEND_STRING PROPERTY COMPILE_FLAGS
38+
# " -Xclang -load -Xclang ${DISPATCH_PLUGIN_PATH} -Xclang -add-plugin -Xclang tensorium-dispatch"
39+
# )

Tests/Matrix/Matrix.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ tensorium::Matrix<K> mul_mat_reference(const tensorium::Matrix<K>& A, const tens
3131
}
3232
int matrix_bench() {
3333
using namespace tensorium;
34-
std::vector<std::size_t> sizes = {256};
34+
std::vector<std::size_t> sizes = {8192};
3535

3636
for (std::size_t N : sizes) {
3737
Matrix<double> A(N, N);

Tests/test.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55
#include <cmath>
66
#include <vector>
77
#include <chrono>
8-
#include <immintrin.h>
98
#include <cassert>
109
#include <cstdlib>
1110
#include <cstring>
1211
#include <algorithm>
13-
#include <omp.h>
12+
// #include <omp.h>
1413
#include <iomanip>
1514
#include <random>
1615

includes/Tensorium/Core/Derivate.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
#include "Vector.hpp"
99
#include <cassert>
1010
#include <cmath>
11-
#include <immintrin.h>
1211
#include <iostream>
1312
#include <numeric>
1413
#include <vector>

includes/Tensorium/Core/Matrix.hpp

Lines changed: 63 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
#include "Vector.hpp"
99
#include <cassert>
1010
#include <cmath>
11-
#include <immintrin.h>
1211
#include <iostream>
1312
#include <vector>
1413

@@ -209,25 +208,51 @@ template <typename K, bool RowMajor = false> class Matrix {
209208
* Uses blocking and micro-kernels to avoid cache bottleneck with FMA/AVX units repartition.
210209
* Fast-paths exist for 4×4, 8×8, and 16×16.
211210
*/
211+
212212
inline Matrix _mul_mat(const Matrix<K> &mat) const {
213213
if (cols != mat.rows)
214214
throw std::invalid_argument("Matrix dimensions do not match for multiplication");
215215

216216
Matrix<K> result(rows, mat.cols);
217217

218-
const K *A = data.data(); // Already column-major (this)
219-
const K *B = mat.data.data(); // Already column-major (rhs)
220-
K *C = result.data.data(); // Output (also column-major)
218+
const K *A = data.data(); // column-major (this)
219+
const K *B = mat.data.data(); // column-major (rhs)
220+
K *C = result.data.data(); // column-major output
221221

222+
#if defined(TENSORIUM_X86)
223+
// SIMD kernel for x86 (AVX2 / AVX512)
222224
tensorium::GemmKernelBigger<K> kernel;
223225
kernel.matmul(const_cast<K *>(A), const_cast<K *>(B), C,
224226
static_cast<int>(rows), // M
225227
static_cast<int>(mat.cols), // N
226-
static_cast<int>(cols) // K
227-
);
228+
static_cast<int>(cols)); // K
229+
230+
#elif defined(TENSORIUM_ARM)
231+
// Temporary fallback (naïve scalar matmul)
232+
for (size_t i = 0; i < rows; ++i) {
233+
for (size_t j = 0; j < mat.cols; ++j) {
234+
K sum = static_cast<K>(0);
235+
for (size_t k = 0; k < cols; ++k)
236+
sum += A[i + k * rows] * B[k + j * mat.rows];
237+
C[i + j * rows] = sum;
238+
}
239+
}
240+
241+
#else
242+
// Generic scalar fallback
243+
for (size_t i = 0; i < rows; ++i) {
244+
for (size_t j = 0; j < mat.cols; ++j) {
245+
K sum = static_cast<K>(0);
246+
for (size_t k = 0; k < cols; ++k)
247+
sum += A[i + k * rows] * B[k + j * mat.rows];
248+
C[i + j * rows] = sum;
249+
}
250+
}
251+
#endif
228252

229253
return result;
230254
}
255+
231256
/**
232257
* @brief Multiply matrix by a vector using SIMD
233258
*
@@ -455,37 +480,44 @@ template <typename K, bool RowMajor = false> class Matrix {
455480
}
456481

457482
return r;
458-
}
483+
}
459484

460-
Matrix& operator+=(const Matrix& m) { this->add(m); return *this; }
461-
Matrix& operator-=(const Matrix& m) { this->sub(m); return *this; }
462-
Matrix& operator*=(K alpha) { this->scl(alpha); return *this; }
485+
Matrix &operator+=(const Matrix &m) {
486+
this->add(m);
487+
return *this;
488+
}
489+
Matrix &operator-=(const Matrix &m) {
490+
this->sub(m);
491+
return *this;
492+
}
493+
Matrix &operator*=(K alpha) {
494+
this->scl(alpha);
495+
return *this;
496+
}
463497
};
464-
template<typename K, bool RM>
465-
Matrix<K, RM> operator+(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
466-
Matrix<K, RM> res = a;
467-
res.add(b);
468-
return res;
498+
template <typename K, bool RM>
499+
Matrix<K, RM> operator+(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
500+
Matrix<K, RM> res = a;
501+
res.add(b);
502+
return res;
469503
}
470-
template<typename K, bool RM>
471-
Matrix<K, RM> operator-(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
472-
Matrix<K, RM> res = a;
473-
res.sub(b);
474-
return res;
504+
template <typename K, bool RM>
505+
Matrix<K, RM> operator-(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
506+
Matrix<K, RM> res = a;
507+
res.sub(b);
508+
return res;
475509
}
476-
template<typename K, bool RM>
477-
Matrix<K, RM> operator*(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
478-
return a._mul_mat(b);
510+
template <typename K, bool RM>
511+
Matrix<K, RM> operator*(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
512+
return a._mul_mat(b);
479513
}
480-
template<typename K, bool RM>
481-
Matrix<K, RM> operator*(const Matrix<K, RM>& m, K alpha) {
482-
Matrix<K, RM> res = m;
483-
res.scl(alpha);
484-
return res;
514+
template <typename K, bool RM> Matrix<K, RM> operator*(const Matrix<K, RM> &m, K alpha) {
515+
Matrix<K, RM> res = m;
516+
res.scl(alpha);
517+
return res;
485518
}
486-
template<typename K, bool RM>
487-
Matrix<K, RM> operator*(K alpha, const Matrix<K, RM>& m) {
488-
return m * alpha;
519+
template <typename K, bool RM> Matrix<K, RM> operator*(K alpha, const Matrix<K, RM> &m) {
520+
return m * alpha;
489521
}
490522

491523
} // namespace tensorium

0 commit comments

Comments
 (0)