Skip to content

Commit afa8631

Browse files
committed
ARM Neon on tha way
1 parent 47e998d commit afa8631

File tree

7 files changed

+1356
-1041
lines changed

7 files changed

+1356
-1041
lines changed

CMakeLists.txt

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
cmake_minimum_required(VERSION 3.22)
2-
project(Tensorium LANGUAGES CXX)
2+
project(Tensorium LANGUAGES C CXX)
33

4+
# ─────────────────────────────── Options ───────────────────────────────
45
option(USE_MPI "Enable MPI support" OFF)
56
option(USE_KNL "Tune for Intel KNL" OFF)
67
option(AVX512 "Enable AVX-512" OFF)
@@ -14,18 +15,28 @@ set(CMAKE_CXX_STANDARD 17)
1415
set(CMAKE_CXX_STANDARD_REQUIRED ON)
1516
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
1617

17-
# Header-only include path
18+
# ─────────────────────────────── Includes ──────────────────────────────
1819
include_directories(${CMAKE_SOURCE_DIR}/Includes)
1920

20-
# Global compiler flags
21-
set(BASE_FLAGS "-O3 -fopenmp -mtune=native -Wno-ignored-attributes -Rpass-analysis=tensorium-align")
22-
set(AVX2_FLAGS "-mfma -mavx2")
23-
set(AVX512_FLAGS "-mfma -mavx512f -mavx512cd")
21+
# ─────────────────────────────── CPU architecture flags ───────────────────────────────
22+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64")
23+
message(STATUS "Configuring for x86_64: enabling AVX2/FMA or AVX512")
24+
set(BASE_FLAGS "-O3 -mtune=native -Wno-ignored-attributes -Rpass-analysis=tensorium-align")
25+
set(AVX2_FLAGS "-mfma -mavx2")
26+
set(AVX512_FLAGS "-mfma -mavx512f -mavx512cd")
27+
if(AVX512)
28+
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX512_FLAGS}")
29+
else()
30+
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX2_FLAGS}")
31+
endif()
2432

25-
if(AVX512)
26-
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX512_FLAGS}")
33+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
34+
message(STATUS "Configuring for Apple Silicon ARM64: disabling AVX flags")
35+
# AVX/FMA are x86-only extensions, unavailable when targeting ARM64 → stick to generic ARM optimizations
36+
set(CMAKE_CXX_FLAGS "-O3 -mcpu=apple-m1 -Wno-ignored-attributes")
2737
else()
28-
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX2_FLAGS}")
38+
message(WARNING "Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}); using generic optimization flags.")
39+
set(CMAKE_CXX_FLAGS "-O3 -mtune=native")
2940
endif()
3041

3142
if(DEBUG)
@@ -35,7 +46,33 @@ if(VERBOSE)
3546
add_compile_definitions(VERBOSE)
3647
endif()
3748

38-
# Subdirectories
49+
50+
51+
# ─────────────────────────────── Architecture detection ───────────────────────────────
52+
message(STATUS "Detected architecture: ${CMAKE_SYSTEM_PROCESSOR}")
53+
54+
# ─────────────────────────────── OpenMP handling ───────────────────────────────
55+
if(APPLE)
56+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
57+
message(STATUS "Configuring OpenMP manually for macOS ARM64 (Apple Clang)")
58+
set(OPENMP_INCLUDE_PATH "/opt/homebrew/opt/libomp/include")
59+
set(OPENMP_LIB_PATH "/opt/homebrew/opt/libomp/lib")
60+
set(OPENMP_LIB "omp")
61+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
62+
message(STATUS "Using OpenMP for macOS x86_64 (Intel)")
63+
find_package(OpenMP)
64+
if(OpenMP_CXX_FOUND)
65+
set(OPENMP_LIB "OpenMP::OpenMP_CXX")
66+
endif()
67+
endif()
68+
else()
69+
find_package(OpenMP)
70+
if(OpenMP_CXX_FOUND)
71+
set(OPENMP_LIB "OpenMP::OpenMP_CXX")
72+
endif()
73+
endif()
74+
75+
# ─────────────────────────────── Subdirectories ───────────────────────────────
3976
if(BUILD_PLUGINS)
4077
add_subdirectory(Plugins)
4178
endif()

Plugins/CMakeLists.txt

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
21
find_package(LLVM REQUIRED CONFIG)
32
include_directories(${LLVM_INCLUDE_DIRS})
43
add_definitions(${LLVM_DEFINITIONS})
54

6-
# Clang AST plugin
5+
# AST plugin
76
add_library(TensoriumDispatchPlugin SHARED TensoriumDispatchPlugin.cpp)
87
target_link_libraries(TensoriumDispatchPlugin PRIVATE clang-cpp LLVM)
8+
set_target_properties(TensoriumDispatchPlugin PROPERTIES COMPILE_FLAGS "")
99

10-
# LLVM IR pass
10+
# LLVM IR plugin
1111
add_library(TensoriumLLVM_IRCheck SHARED TensoriumLLVM_IRCheck.cpp)
1212
target_link_libraries(TensoriumLLVM_IRCheck PRIVATE LLVM)
13-
14-
# MLIR plugin (if present)
15-
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/MLIR/CMakeLists.txt)
16-
add_subdirectory(MLIR)
17-
endif()
13+
set_target_properties(TensoriumLLVM_IRCheck PROPERTIES COMPILE_FLAGS "")

Pybind/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
find_package(pybind11 REQUIRED)
3+
pybind11_add_module(tensorium_bindings bindings.cpp)
4+
target_include_directories(tensorium_bindings PRIVATE ${CMAKE_SOURCE_DIR}/Includes)

Tests/CMakeLists.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
file(GLOB TEST_SOURCES
3+
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
4+
${CMAKE_CURRENT_SOURCE_DIR}/Matrix/*.cpp
5+
${CMAKE_CURRENT_SOURCE_DIR}/Vector/*.cpp
6+
${CMAKE_CURRENT_SOURCE_DIR}/Derivatives/*.cpp
7+
${CMAKE_CURRENT_SOURCE_DIR}/LinearSystems/*.cpp
8+
${CMAKE_CURRENT_SOURCE_DIR}/Tensor/*.cpp
9+
)
10+
11+
add_executable(TensoriumTests ${TEST_SOURCES})
12+
13+
# --- OpenMP flags per architecture ---
14+
if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
15+
include_directories(${OPENMP_INCLUDE_PATH})
16+
link_directories(${OPENMP_LIB_PATH})
17+
target_compile_options(TensoriumTests PRIVATE -Xpreprocessor -fopenmp)
18+
target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
19+
elseif(DEFINED OPENMP_LIB)
20+
target_link_libraries(TensoriumTests PRIVATE ${OPENMP_LIB})
21+
endif()
22+
23+
# --- MPI / KNL ---
24+
if(USE_MPI)
25+
find_package(MPI REQUIRED)
26+
target_link_libraries(TensoriumTests PRIVATE MPI::MPI_CXX)
27+
target_compile_definitions(TensoriumTests PRIVATE MORPHEUS_USE_MPI)
28+
endif()
29+
30+
if(USE_KNL)
31+
target_compile_options(TensoriumTests PRIVATE -mtune=knl -mfma -mavx512f -mavx512cd)
32+
target_compile_definitions(TensoriumTests PRIVATE USE_KNL)
33+
target_link_libraries(TensoriumTests PRIVATE memkind)
34+
endif()
35+

includes/Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <algorithm>
66
#include <cstdlib>
77
#include <cstring>
8-
#include <immintrin.h>
98
/*
109
* This GEMM kernel is based on Aman Salykov's version, with improved OMP scheduling and block
1110
* sizes

includes/Tensorium/SIMD/CPU_id.hpp

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,69 @@
1-
#pragma once
1+
#pragma once
22

3-
#include <cpuid.h>
43
#include <cstring>
5-
#include <string>
64
#include <iostream>
5+
#include <string>
6+
7+
#if defined(__APPLE__)
8+
# include <sys/sysctl.h>
9+
#endif
10+
#if defined(__x86_64__) || defined(_M_X64)
11+
# include <cpuid.h>
12+
# define TENSORIUM_X86 1
13+
#elif defined(__aarch64__) || defined(__arm64__)
14+
# define TENSORIUM_ARM 1
15+
#else
16+
# define TENSORIUM_FALLBACK 1
17+
#endif
718

19+
// ─────────────────────────────── CPU Brand ───────────────────────────────
820
inline std::string get_cpu_brand() {
9-
char brand[0x40] = {0};
10-
unsigned int regs[4] = {0};
11-
for (int i = 0; i < 3; ++i) {
12-
__cpuid(0x80000002 + i, regs[0], regs[1], regs[2], regs[3]);
13-
std::memcpy(brand + i * 16, regs, sizeof(regs));
14-
}
15-
return std::string(brand);
21+
#if defined(TENSORIUM_X86)
22+
char brand[0x40] = {0};
23+
unsigned int regs[4] = {0};
24+
for (int i = 0; i < 3; ++i) {
25+
__cpuid(0x80000002 + i, regs[0], regs[1], regs[2], regs[3]);
26+
std::memcpy(brand + i * 16, regs, sizeof(regs));
27+
}
28+
return std::string(brand);
29+
#elif defined(TENSORIUM_ARM)
30+
// Apple Silicon / ARM64 fallback
31+
// cf. /proc/cpuinfo (Linux) or sysctl hw.model (macOS)
32+
# if defined(__APPLE__)
33+
char buffer[128];
34+
size_t size = sizeof(buffer);
35+
if (sysctlbyname("machdep.cpu.brand_string", &buffer, &size, NULL, 0) == 0)
36+
return std::string(buffer);
37+
if (sysctlbyname("hw.model", &buffer, &size, NULL, 0) == 0)
38+
return std::string(buffer);
39+
return "Apple ARM CPU";
40+
# else
41+
return "Generic ARM CPU";
42+
# endif
43+
#else
44+
return "Unknown CPU";
45+
#endif
1646
}
1747

48+
// ─────────────────────────────── Block size heuristic ───────────────────────────────
1849
inline size_t detect_optimal_block_size() {
19-
std::string brand = get_cpu_brand();
50+
std::string brand = get_cpu_brand();
2051

21-
if (brand.find("Xeon Phi") != std::string::npos) return 256;
22-
if (brand.find("Xeon") != std::string::npos) return 128;
23-
if (brand.find("Ryzen") != std::string::npos) return 96;
24-
if (brand.find("Apple") != std::string::npos) return 64;
25-
if (brand.find("Core(TM)") != std::string::npos) return 128;
26-
std::cout << "Unknown CPU brand. Defaulting to 64." << std::endl;
27-
return 64;
28-
}
52+
if (brand.find("Xeon Phi") != std::string::npos)
53+
return 256;
54+
if (brand.find("Xeon") != std::string::npos)
55+
return 128;
56+
if (brand.find("Ryzen") != std::string::npos)
57+
return 96;
58+
if (brand.find("Apple") != std::string::npos)
59+
return 64;
60+
if (brand.find("Core(TM)") != std::string::npos)
61+
return 128;
2962

63+
#if defined(TENSORIUM_ARM)
64+
return 64; // safe default for M1/M2
65+
#else
66+
std::cout << "[detect_optimal_block_size] Unknown CPU brand. Defaulting to 64.\n";
67+
return 64;
68+
#endif
69+
}

0 commit comments

Comments
 (0)