Replace CPP content with aarch64 branch content

at0m741 · at0m741 · commit 9f722a0303a5 · 2025-10-26T19:20:49.000+01:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,23 +1,92 @@
+cmake_minimum_required(VERSION 3.22)
+project(Tensorium LANGUAGES C CXX)
 
-cmake_minimum_required(VERSION 3.12)
-project(tensorium_bindings LANGUAGES CXX)
+# ─────────────────────────────── Options ───────────────────────────────
+# User-facing switches; set them with -D<NAME>=ON|OFF at configure time.
+# USE_MPI: Tests/ then requires MPI, links MPI::MPI_CXX and defines MORPHEUS_USE_MPI.
+option(USE_MPI "Enable MPI support" OFF)
+# USE_KNL: Tests/ then adds -mtune=knl plus AVX-512 flags, defines USE_KNL and links memkind.
+option(USE_KNL "Tune for Intel KNL" OFF)
+# AVX512: on x86_64, selects the AVX-512 flag set instead of the AVX2 one.
+option(AVX512 "Enable AVX-512" OFF)
+# VERBOSE: adds the VERBOSE compile definition.
+option(VERBOSE "Verbose runtime logs" OFF)
+# DEBUG: appends -g to CMAKE_CXX_FLAGS.
+option(DEBUG "Build with debug symbols" OFF)
+# BUILD_PLUGINS: gates add_subdirectory(Plugins).
+option(BUILD_PLUGINS "Build Clang/LLVM plugins" ON)
+# BUILD_TESTS: gates add_subdirectory(Tests).
+option(BUILD_TESTS "Build test executables" ON)
+# BUILD_PYBIND: meant to gate the pybind11 module under Pybind/.
+# NOTE(review): confirm it is wired to an add_subdirectory() call.
+option(BUILD_PYBIND "Build Python bindings" OFF)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-add_compile_options(-O3 -march=native -mavx2 -mfma -Wignored-attributes)
-add_compile_options(-Wno-ignored-attributes)
-find_package(pybind11 REQUIRED)
+message(STATUS "Detecting architecture...")
 
-find_package(OpenMP REQUIRED)
+# ─────────────────────────────── Architecture detection ───────────────────────────────
+# Classify CMAKE_SYSTEM_PROCESSOR and publish exactly one of TENSORIUM_X86,
+# TENSORIUM_ARM or TENSORIUM_FALLBACK as a directory-wide compile definition.
+# Headers branch on these macros (Matrix::_mul_mat checks TENSORIUM_X86).
+# MATCHES is a case-sensitive regex: "AMD64" is accepted here only in upper case.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
+    message(STATUS "→ Building for x86_64 (AVX/AVX2/AVX512 enabled)")
+    add_compile_definitions(TENSORIUM_X86)
+    # AVX2 and FMA are switched on for the whole directory tree on x86,
+    # independently of the AVX512 option.
+    add_compile_options(-mavx2 -mfma)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+    message(STATUS "→ Building for ARM64 / Apple Silicon (NEON enabled)")
+    # No extra compile options are added on ARM here; only the macro is defined.
+    add_compile_definitions(TENSORIUM_ARM)
+else()
+    # Any other processor string still configures, with a warning.
+    message(WARNING "→ Unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}, using scalar fallback.")
+    add_compile_definitions(TENSORIUM_FALLBACK)
+endif()
 
-include_directories(${CMAKE_SOURCE_DIR}/includes)
+# Project header root. The tree is tracked as lowercase `includes/` (see the
+# header paths touched by this change), so the exact case matters on
+# case-sensitive filesystems such as Linux.
+include_directories(${PROJECT_SOURCE_DIR}/includes)
 
-pybind11_add_module(tensorium ./Pybind/bindings.cpp)
+# ─────────────────────────────── CPU optimization flags ───────────────────────────────
+# Appends per-architecture optimization flags to CMAKE_CXX_FLAGS.
+#  * Flags are appended, so anything supplied through CXXFLAGS or
+#    -DCMAKE_CXX_FLAGS=... is preserved instead of being overwritten.
+#  * The x86 pattern accepts "AMD64" as well as "amd64"/"x86_64" (MATCHES is
+#    case-sensitive), in line with the architecture-detection block.
+#  * -Rpass-analysis is a Clang remark option, so it is only emitted for
+#    Clang and AppleClang.
+#  * -mcpu=apple-m1 is restricted to Apple hosts; other AArch64 hosts get
+#    -mcpu=native.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
+  message(STATUS "Configuring for x86_64: enabling AVX2/FMA or AVX512")
+  set(BASE_FLAGS "-O3 -mtune=native -Wno-ignored-attributes")
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    string(APPEND BASE_FLAGS " -Rpass-analysis=tensorium-align")
+  endif()
+  set(AVX2_FLAGS "-mfma -mavx2")
+  set(AVX512_FLAGS "-mfma -mavx512f -mavx512cd")
+  if(AVX512)
+    string(APPEND CMAKE_CXX_FLAGS " ${BASE_FLAGS} ${AVX512_FLAGS}")
+  else()
+    string(APPEND CMAKE_CXX_FLAGS " ${BASE_FLAGS} ${AVX2_FLAGS}")
+  endif()
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+  if(APPLE)
+    message(STATUS "Configuring for Apple Silicon ARM64: disabling AVX flags")
+    string(APPEND CMAKE_CXX_FLAGS " -O3 -mcpu=apple-m1 -Wno-ignored-attributes")
+  else()
+    message(STATUS "Configuring for ARM64: disabling AVX flags")
+    string(APPEND CMAKE_CXX_FLAGS " -O3 -mcpu=native -Wno-ignored-attributes")
+  endif()
+else()
+  message(WARNING "Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}); using generic optimization flags.")
+  string(APPEND CMAKE_CXX_FLAGS " -O3 -mtune=native")
+endif()
 
-target_link_libraries(tensorium PRIVATE OpenMP::OpenMP_CXX)
-# install(TARGETS tensorium
-#         LIBRARY DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+# VERBOSE=ON publishes the VERBOSE preprocessor symbol directory-wide.
+if(VERBOSE)
+  add_compile_definitions(VERBOSE)
+endif()
+# DEBUG=ON adds -g to the C++ flags, independently of CMAKE_BUILD_TYPE.
+if(DEBUG)
+  string(APPEND CMAKE_CXX_FLAGS " -g")
+endif()
 
+message(STATUS "Detected architecture: ${CMAKE_SYSTEM_PROCESSOR}")
 
+# ─────────────────────────────── OpenMP handling ───────────────────────────────
+# Publishes OPENMP_LIB (and, on Apple Silicon, OPENMP_INCLUDE_PATH /
+# OPENMP_LIB_PATH) for the subdirectories. OPENMP_LIB stays undefined when no
+# OpenMP runtime is available; consumers test for that with if(DEFINED ...).
+if(APPLE)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+    message(STATUS "Configuring OpenMP manually for macOS ARM64 (Apple Clang)")
+    # Defaults point at Homebrew's libomp; both locations are cache entries so
+    # they can be overridden with -DOPENMP_INCLUDE_PATH=... / -DOPENMP_LIB_PATH=...
+    set(OPENMP_INCLUDE_PATH "/opt/homebrew/opt/libomp/include" CACHE PATH "libomp include directory")
+    set(OPENMP_LIB_PATH "/opt/homebrew/opt/libomp/lib" CACHE PATH "libomp library directory")
+    set(OPENMP_LIB "omp")
+    if(NOT EXISTS "${OPENMP_INCLUDE_PATH}/omp.h")
+      message(WARNING "omp.h not found in ${OPENMP_INCLUDE_PATH}; set OPENMP_INCLUDE_PATH and OPENMP_LIB_PATH to your libomp installation.")
+    endif()
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    message(STATUS "Using OpenMP for macOS x86_64 (Intel)")
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+      set(OPENMP_LIB "OpenMP::OpenMP_CXX")
+    endif()
+  endif()
+else()
+  find_package(OpenMP)
+  if(OpenMP_CXX_FOUND)
+    set(OPENMP_LIB "OpenMP::OpenMP_CXX")
+  endif()
+endif()
+# Make the silent "no OpenMP" configuration visible in the configure log.
+if(NOT DEFINED OPENMP_LIB)
+  message(STATUS "OpenMP not found: building without OpenMP support")
+endif()
+
+# ─────────────────────────────── Plugins ───────────────────────────────
+if(BUILD_PLUGINS)
+  add_subdirectory(Plugins)
+endif()
+
+# ─────────────────────────────── Tests ───────────────────────────────
+if(BUILD_TESTS)
+  add_subdirectory(Tests)
+endif()
+
+# ─────────────────────────────── Python bindings ───────────────────────────────
+# Connects the BUILD_PYBIND option (OFF by default) to Pybind/CMakeLists.txt,
+# which is otherwise not reachable from this file.
+if(BUILD_PYBIND)
+  add_subdirectory(Pybind)
+endif()
diff --git a/Plugins/CMakeLists.txt b/Plugins/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Builds the two Tensorium compiler plugins against an installed LLVM.
+find_package(LLVM REQUIRED CONFIG)
+message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION} (config in ${LLVM_DIR})")
+
+# LLVM_DEFINITIONS is one space-separated string; split it into a real list
+# before handing it to add_definitions(), as the LLVM CMake guide does.
+separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND "${LLVM_DEFINITIONS}")
+add_definitions(${LLVM_DEFINITIONS_LIST})
+
+# List of components to map to libraries
+set(LLVM_COMPONENTS
+    Core
+    Support
+    IRReader
+    Analysis
+    Passes
+    TransformUtils
+)
+
+llvm_map_components_to_libnames(LLVM_LIBS ${LLVM_COMPONENTS})
+
+# Tensorium Dispatch Plugin (Clang AST)
+# NOTE(review): clang-cpp is linked by bare name and no find_package(Clang) is
+# visible here, so the linker has to locate it on its own search path — confirm.
+add_library(TensoriumDispatchPlugin SHARED TensoriumDispatchPlugin.cpp)
+target_include_directories(TensoriumDispatchPlugin PRIVATE ${LLVM_INCLUDE_DIRS})
+target_link_libraries(TensoriumDispatchPlugin PRIVATE clang-cpp ${LLVM_LIBS})
+
+# Tensorium LLVM IR Plugin
+add_library(TensoriumLLVM_IRCheck SHARED TensoriumLLVM_IRCheck.cpp)
+target_include_directories(TensoriumLLVM_IRCheck PRIVATE ${LLVM_INCLUDE_DIRS})
+target_link_libraries(TensoriumLLVM_IRCheck PRIVATE ${LLVM_LIBS})
diff --git a/Plugins/TensoriumDispatchPlugin.cpp b/Plugins/TensoriumDispatchPlugin.cpp
@@ -6,6 +6,12 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <iostream>
+//
+// static int _ = []() {
+//     std::cerr << "[TensoriumDispatchPlugin] Plugin loaded into Clang.\n";
+//     return 0;
+// }();
 // #include "LLVM_Handler.hpp"
 /**
  * @file TensoriumPlugin.cpp
@@ -59,7 +65,8 @@ class TensoriumASTConsumer : public ASTConsumer {
         // 		SourceLocation loc = FD->getBeginLoc();
         // 		for (const auto &entry : TensoriumTargetTable) {
         // 			if (Context.getSourceManager().isBeforeInTranslationUnit(entry.loc,
-        // loc)) { 				std::string fname = FD->getNameAsString(); 				llvm::errs() << "[tensorium] Target("
+        // loc)) { 				std::string fname = FD->getNameAsString();
+        // llvm::errs() << "[tensorium] Target("
         // << entry.platform << ", " << entry.isa
         // 					<< ") applies to function " << fname << "\n";
         //
@@ -300,6 +307,16 @@ class TensoriumPluginAction : public PluginASTAction {
 
 } // namespace
 
-/// @brief Register the plugin under the name "tensorium-dispatch"
-static FrontendPluginRegistry::Add<TensoriumPluginAction> X("tensorium-dispatch",
-                                                            "Handle #pragma tensorium directives");
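+// @brief Register the plugin under the name "tensorium-dispatch".
+// NOTE(review): an earlier remark here mentioned a ManagedStatic wrapper to avoid double-free issues; the registration below is a plain static FrontendPluginRegistry::Add object — confirm which is intended.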
+
+#include "clang/Frontend/FrontendPluginRegistry.h"
+
+// …
+
+using clang::FrontendPluginRegistry;
+
+static FrontendPluginRegistry::Add<TensoriumPluginAction>
+    X("tensorium-dispatch", "Handle #pragma tensorium directives");
+
+
diff --git a/Pybind/CMakeLists.txt b/Pybind/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+# Python extension module built from bindings.cpp with pybind11.
+find_package(pybind11 REQUIRED)
+# NOTE(review): the target (and so the module file) is named tensorium_bindings;
+# the name given to PYBIND11_MODULE() in bindings.cpp has to match — confirm.
+pybind11_add_module(tensorium_bindings bindings.cpp)
+# The header tree is tracked as lowercase `includes/`; spelling it `Includes`
+# only resolves on case-insensitive filesystems.
+target_include_directories(tensorium_bindings PRIVATE ${PROJECT_SOURCE_DIR}/includes)
diff --git a/Pybind/bindings.cpp b/Pybind/bindings.cpp
@@ -1,6 +1,6 @@
-#include "Tensorium/Tensorium.hpp"
+#include "../includes/Tensorium/Tensorium.hpp"
 #include <iostream>
-#include <omp.h>
+// #include <omp.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Collect every test translation unit from Tests/ and its category folders.
+# CONFIGURE_DEPENDS makes the build re-check the globs, so a newly added test
+# file is picked up without re-running CMake by hand.
+file(GLOB TEST_SOURCES CONFIGURE_DEPENDS
+    ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/Matrix/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/Vector/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/Derivatives/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/LinearSystems/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/Tensor/*.cpp
+)
+
+# Single executable containing all collected tests.
+add_executable(TensoriumTests ${TEST_SOURCES})
+
+# OpenMP wiring. On Apple Silicon the top-level file only provides raw libomp
+# paths, so they are attached to the target itself: link_directories() only
+# affects targets created after the call, and TensoriumTests already exists.
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
+  target_include_directories(TensoriumTests PRIVATE ${OPENMP_INCLUDE_PATH})
+  target_link_directories(TensoriumTests PRIVATE ${OPENMP_LIB_PATH})
+  # Apple Clang needs -Xpreprocessor to accept -fopenmp.
+  target_compile_options(TensoriumTests PRIVATE -Xpreprocessor -fopenmp)
+  target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
+elseif(DEFINED OPENMP_LIB)
+  # Elsewhere OPENMP_LIB, when defined, is the imported OpenMP::OpenMP_CXX target.
+  target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
+endif()
+
+# Optional MPI support: requires MPI, links the imported MPI::MPI_CXX target
+# and defines MORPHEUS_USE_MPI for the test sources.
+# NOTE(review): the macro carries a MORPHEUS_ prefix rather than a Tensorium
+# one — confirm this is the name the sources actually test.
+if(USE_MPI)
+  find_package(MPI REQUIRED)
+  target_link_libraries(TensoriumTests PRIVATE MPI::MPI_CXX)
+  target_compile_definitions(TensoriumTests PRIVATE MORPHEUS_USE_MPI)
+endif()
+
+# Optional Intel KNL tuning: adds KNL / AVX-512 code-generation flags, defines
+# USE_KNL and links memkind by bare library name.
+if(USE_KNL)
+  target_compile_options(TensoriumTests PRIVATE -mtune=knl -mfma -mavx512f -mavx512cd)
+  target_compile_definitions(TensoriumTests PRIVATE USE_KNL)
+  target_link_libraries(TensoriumTests PRIVATE memkind)
+endif()
+# set(DISPATCH_PLUGIN_PATH "${CMAKE_BINARY_DIR}/Plugins/libTensoriumDispatchPlugin.so")
+#
+# add_dependencies(TensoriumTests TensoriumDispatchPlugin)
+#
+# # inject the clang plugin (frontend)
+# set_property(TARGET TensoriumTests APPEND_STRING PROPERTY COMPILE_FLAGS
+#     " -Xclang -load -Xclang ${DISPATCH_PLUGIN_PATH} -Xclang -add-plugin -Xclang tensorium-dispatch"
+# )
diff --git a/Tests/Matrix/Matrix.cpp b/Tests/Matrix/Matrix.cpp
@@ -31,7 +31,7 @@ tensorium::Matrix<K> mul_mat_reference(const tensorium::Matrix<K>& A, const tens
 }
 int matrix_bench() {
 	using namespace tensorium;
-	std::vector<std::size_t> sizes = {256};
+	std::vector<std::size_t> sizes = {8192};
 
 	for (std::size_t N : sizes) {
 		Matrix<double> A(N, N);
diff --git a/Tests/test.hpp b/Tests/test.hpp
@@ -5,12 +5,11 @@
 #include <cmath>
 #include <vector>
 #include <chrono>
-#include <immintrin.h>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
-#include <omp.h>
+// #include <omp.h>
 #include <iomanip>
 #include <random>
 
diff --git a/includes/Tensorium/Core/Derivate.hpp b/includes/Tensorium/Core/Derivate.hpp
@@ -8,7 +8,6 @@
 #include "Vector.hpp"
 #include <cassert>
 #include <cmath>
-#include <immintrin.h>
 #include <iostream>
 #include <numeric>
 #include <vector>
diff --git a/includes/Tensorium/Core/Matrix.hpp b/includes/Tensorium/Core/Matrix.hpp
@@ -8,7 +8,6 @@
 #include "Vector.hpp"
 #include <cassert>
 #include <cmath>
-#include <immintrin.h>
 #include <iostream>
 #include <vector>
 
@@ -209,25 +208,51 @@ template <typename K, bool RowMajor = false> class Matrix {
      * Uses blocking and micro-kernels to avoid cache bottleneck with FMA/AVX units repartition.
      * Fast-paths exist for 4×4, 8×8, and 16×16.
      */
+
     inline Matrix _mul_mat(const Matrix<K> &mat) const {
+        // The inner dimensions must agree: (rows x cols) * (mat.rows x mat.cols).
         if (cols != mat.rows)
             throw std::invalid_argument("Matrix dimensions do not match for multiplication");
 
         Matrix<K> result(rows, mat.cols);
 
-        const K *A = data.data();        // Already column-major (this)
-        const K *B = mat.data.data();    // Already column-major (rhs)
-        K       *C = result.data.data(); // Output (also column-major)
+        const K *A = data.data();        // column-major (this)
+        const K *B = mat.data.data();    // column-major (rhs)
+        K       *C = result.data.data(); // column-major output
 
+#if defined(TENSORIUM_X86)
+        // SIMD kernel for x86 (AVX2 / AVX512)
         tensorium::GemmKernelBigger<K> kernel;
         kernel.matmul(const_cast<K *>(A), const_cast<K *>(B), C,
                       static_cast<int>(rows),     // M
                       static_cast<int>(mat.cols), // N
-                      static_cast<int>(cols)      // K
-        );
+                      static_cast<int>(cols));    // K
+
+#else
+        // Temporary scalar fallback (naïve triple loop), shared by
+        // TENSORIUM_ARM and by every other non-x86 target: each C(i, j) is
+        // the dot product of row i of A and column j of B, indexed
+        // column-major.
+        for (size_t i = 0; i < rows; ++i) {
+            for (size_t j = 0; j < mat.cols; ++j) {
+                K sum = static_cast<K>(0);
+                for (size_t k = 0; k < cols; ++k)
+                    sum += A[i + k * rows] * B[k + j * mat.rows];
+                C[i + j * rows] = sum;
+            }
+        }
+#endif
 
         return result;
     }
+
     /**
      * @brief Multiply matrix by a vector using SIMD
      *
@@ -455,37 +480,44 @@ template <typename K, bool RowMajor = false> class Matrix {
         }
 
         return r;
-	}
+    }
 
-	Matrix& operator+=(const Matrix& m) { this->add(m); return *this; }
-	Matrix& operator-=(const Matrix& m) { this->sub(m); return *this; }
-	Matrix& operator*=(K alpha) { this->scl(alpha); return *this; }
+    /// In-place addition: forwards to add(m) and returns *this for chaining.
+    Matrix &operator+=(const Matrix &m) {
+        this->add(m);
+        return *this;
+    }
+    /// In-place subtraction: forwards to sub(m) and returns *this for chaining.
+    Matrix &operator-=(const Matrix &m) {
+        this->sub(m);
+        return *this;
+    }
+    /// In-place scaling: forwards to scl(alpha) and returns *this for chaining.
+    Matrix &operator*=(K alpha) {
+        this->scl(alpha);
+        return *this;
+    }
 };
-template<typename K, bool RM>
-Matrix<K, RM> operator+(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-	Matrix<K, RM> res = a;
-	res.add(b);
-	return res;
+/// Sum of two matrices: copies `a`, applies add(b) to the copy and returns it.
+template <typename K, bool RM>
+Matrix<K, RM> operator+(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    Matrix<K, RM> sum(a);
+    sum.add(b);
+    return sum;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator-(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-	Matrix<K, RM> res = a;
-	res.sub(b);
-	return res;
+/// Difference of two matrices: copies `a`, applies sub(b) to the copy and returns it.
+template <typename K, bool RM>
+Matrix<K, RM> operator-(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    Matrix<K, RM> difference(a);
+    difference.sub(b);
+    return difference;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-	return a._mul_mat(b);
+/// Matrix product `a * b`: forwards to a._mul_mat(b).
+template <typename K, bool RM>
+Matrix<K, RM> operator*(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    return a._mul_mat(b);
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(const Matrix<K, RM>& m, K alpha) {
-	Matrix<K, RM> res = m;
-	res.scl(alpha);
-	return res;
+/// Scalar product `m * alpha`: copies `m`, applies scl(alpha) to the copy and returns it.
+template <typename K, bool RM> Matrix<K, RM> operator*(const Matrix<K, RM> &m, K alpha) {
+    Matrix<K, RM> scaled(m);
+    scaled.scl(alpha);
+    return scaled;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(K alpha, const Matrix<K, RM>& m) {
-	return m * alpha;
+/// Scalar product `alpha * m`: defined as `m * alpha`.
+template <typename K, bool RM> Matrix<K, RM> operator*(K alpha, const Matrix<K, RM> &m) {
+    return m * alpha;
 }
 
 } // namespace tensorium
diff --git a/includes/Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp b/includes/Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp
diff --git a/includes/Tensorium/Core/Vector.hpp b/includes/Tensorium/Core/Vector.hpp
diff --git a/includes/Tensorium/SIMD/CPU_id.hpp b/includes/Tensorium/SIMD/CPU_id.hpp
diff --git a/includes/Tensorium/SIMD/SIMD.hpp b/includes/Tensorium/SIMD/SIMD.hpp

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ tensorium::Matrix<K> mul_mat_reference(const tensorium::Matrix<K>& A, const tens`
`31`	`31`	`}`
`32`	`32`	`int matrix_bench() {`
`33`	`33`	`using namespace tensorium;`
`34`		`- std::vector<std::size_t> sizes = {256};`
	`34`	`+ std::vector<std::size_t> sizes = {8192};`
`35`	`35`
`36`	`36`	`for (std::size_t N : sizes) {`
`37`	`37`	`Matrix<double> A(N, N);`