Skip to content

Commit 86cb18c

Browse files
committed
ARM aarch64 finally compiles and works fine
1 parent fcad3f5 commit 86cb18c

File tree

4 files changed

+229
-187
lines changed

4 files changed

+229
-187
lines changed

CMakeLists.txt

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,39 @@ option(BUILD_PLUGINS "Build Clang/LLVM plugins" ON)
1010
option(BUILD_TESTS "Build test executables" ON)
1111
option(BUILD_PYBIND "Build Python bindings" OFF)
1212
option(USE_CUDA "Enable CUDA kernels (requires nvcc)" OFF)
13+
option(USE_CBLAS "Enable BLAS" ON)
1314

1415
set(CMAKE_CXX_STANDARD 17)
1516
set(CMAKE_CXX_STANDARD_REQUIRED ON)
1617
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
1718

18-
message(STATUS "Detecting architecture...")
19+
if(APPLE)
20+
list(APPEND CMAKE_PREFIX_PATH "/opt/local" "/opt/local/libexec/llvm-20")
21+
include_directories("/opt/local/include")
22+
link_directories("/opt/local/lib")
23+
link_directories("/opt/local/libexec/llvm-20/lib")
24+
25+
find_library(ACCELERATE_LIB Accelerate)
26+
if(ACCELERATE_LIB)
27+
link_libraries(${ACCELERATE_LIB})
28+
add_compile_definitions(TENSORIUM_USE_CBLAS)
29+
endif()
30+
endif()
1931

2032
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
21-
message(STATUS "→ Building for x86_64 (AVX/AVX2/AVX512 enabled)")
2233
add_compile_definitions(TENSORIUM_X86)
2334
add_compile_options(-mavx2 -mfma)
2435
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
25-
message(STATUS "→ Building for ARM64 / Apple Silicon (NEON enabled)")
2636
add_compile_definitions(TENSORIUM_ARM)
2737
else()
28-
message(WARNING "→ Unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}, using scalar fallback.")
2938
add_compile_definitions(TENSORIUM_FALLBACK)
3039
endif()
3140

3241
find_library(NUMA_LIB numa)
3342
if (NUMA_LIB)
34-
message(STATUS "→ libnuma detected: enabling NUMA-aware allocation")
3543
add_compile_definitions(USE_NUMA)
3644
set(HAVE_NUMA TRUE)
3745
else()
38-
message(WARNING "libnuma not found: NUMA support disabled")
3946
set(HAVE_NUMA FALSE)
4047
endif()
4148

@@ -47,24 +54,18 @@ execute_process(
4754
)
4855

4956
if (NOT GPU_NAME STREQUAL "none")
50-
message(STATUS "Detected NVIDIA GPU: ${GPU_NAME}")
5157
add_compile_definitions(TENSORIUM_GPU_PRESENT)
52-
else()
53-
message(WARNING "No NVIDIA GPU detected or nvidia-smi missing")
5458
endif()
5559

5660
if (USE_CUDA)
5761
if (NOT CUDAToolkit_FOUND)
5862
message(FATAL_ERROR "USE_CUDA=ON but CUDA toolkit not found")
5963
endif()
6064

61-
message(STATUS "CUDA toolkit found at: ${CUDAToolkit_ROOT}")
62-
message(STATUS "→ Version: ${CUDAToolkit_VERSION}")
6365
add_compile_definitions(TENSORIUM_CUDA)
6466

6567
enable_language(CUDA)
6668
set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}")
67-
message(STATUS "Using NVCC for CUDA compilation: ${CMAKE_CUDA_COMPILER}")
6869

6970
set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 90)
7071
add_compile_definitions(TENSORIUM_CUDA_ARCH=${CMAKE_CUDA_ARCHITECTURES})
@@ -75,31 +76,25 @@ if (USE_CUDA)
7576

7677
include_directories(${CUDAToolkit_INCLUDE_DIRS})
7778
link_directories(${CUDAToolkit_LIBRARY_DIR})
78-
79-
message(STATUS "→ CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
8079
else()
81-
message(STATUS "CUDA support disabled (USE_CUDA=OFF)")
8280
add_compile_definitions(TENSORIUM_NO_CUDA)
8381
endif()
8482

8583
include_directories(${CMAKE_SOURCE_DIR}/Includes)
8684

8785
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "amd64")
88-
message(STATUS "Configuring for x86_64: enabling AVX2/FMA or AVX512")
8986
set(BASE_FLAGS "-O3 -mtune=native -Wno-ignored-attributes -Rpass-analysis=tensorium-align")
9087
set(AVX2_FLAGS "-mfma -mavx2")
9188
set(AVX512_FLAGS "-mfma -mavx512f -mavx512cd")
9289
if(AVX512)
93-
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX512_FLAGS}")
90+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BASE_FLAGS} ${AVX512_FLAGS}")
9491
else()
95-
set(CMAKE_CXX_FLAGS "${BASE_FLAGS} ${AVX2_FLAGS}")
92+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BASE_FLAGS} ${AVX2_FLAGS}")
9693
endif()
9794
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
98-
message(STATUS "Configuring for ARM64: disabling AVX flags")
99-
set(CMAKE_CXX_FLAGS "-O3 -mcpu=apple-m1 -Wno-ignored-attributes")
95+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mcpu=native -Wno-ignored-attributes")
10096
else()
101-
message(WARNING "Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}); using generic optimization flags.")
102-
set(CMAKE_CXX_FLAGS "-O3 -mtune=native")
97+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mtune=native")
10398
endif()
10499

105100
if(DEBUG)
@@ -109,16 +104,17 @@ if(VERBOSE)
109104
add_compile_definitions(VERBOSE)
110105
endif()
111106

112-
message(STATUS "Detected architecture: ${CMAKE_SYSTEM_PROCESSOR}")
113-
114107
if(APPLE)
115108
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
116-
message(STATUS "Configuring OpenMP manually for macOS ARM64 (Apple Clang)")
117-
set(OPENMP_INCLUDE_PATH "/opt/homebrew/opt/libomp/include")
118-
set(OPENMP_LIB_PATH "/opt/homebrew/opt/libomp/lib")
109+
set(OPENMP_INCLUDE_PATH "/opt/local/include/libomp")
110+
set(OPENMP_LIB_PATH "/opt/local/lib/libomp")
119111
set(OPENMP_LIB "omp")
112+
113+
include_directories(${OPENMP_INCLUDE_PATH})
114+
link_directories(${OPENMP_LIB_PATH})
115+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp")
116+
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lomp")
120117
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
121-
message(STATUS "Using OpenMP for macOS x86_64 (Intel)")
122118
find_package(OpenMP)
123119
if(OpenMP_CXX_FOUND)
124120
set(OPENMP_LIB "OpenMP::OpenMP_CXX")
@@ -132,9 +128,9 @@ else()
132128
endif()
133129

134130
if (HAVE_NUMA)
135-
message(STATUS "→ Linking libnuma to all targets (NUMA-aware allocation enabled)")
136131
link_libraries(${NUMA_LIB})
137132
endif()
133+
138134
if(BUILD_PLUGINS)
139135
add_subdirectory(Plugins)
140136
endif()

Tests/Matrix/Matrix.cpp

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11
#include "../test.hpp"
22
using namespace tensorium;
33
#include <complex>
4-
#include <cblas.h>
54
#include <fstream>
6-
5+
#include <iostream>
6+
#include <vector>
7+
8+
// --- HEADER GUARD: include a CBLAS header only when TENSORIUM_USE_CBLAS is defined ---
9+
#ifdef TENSORIUM_USE_CBLAS
10+
# ifdef __APPLE__
11+
# include <Accelerate/Accelerate.h>
12+
# else
13+
# include <cblas.h>
14+
# endif
15+
#endif
716
#define CHECK(expr) \
817
do { \
918
if (!(expr)) { \
@@ -37,22 +46,29 @@ int matrix_bench() {
3746
using namespace tensorium;
3847

3948
std::vector<std::size_t> sizes = {1024, 2048, 4096, 8192, 16384};
40-
std::string csv_path = "matrix_bench_results.csv";
49+
std::string csv_path = "matrix_bench_results.csv";
4150

4251
std::ofstream csv(csv_path);
52+
53+
#ifdef TENSORIUM_USE_CBLAS
4354
csv << "N,Tensorium_GFLOPs,OpenBLAS_GFLOPs,Tensorium_Time(s),OpenBLAS_Time(s),Speedup\n";
55+
#else
56+
csv << "N,Tensorium_GFLOPs,Tensorium_Time(s)\n";
57+
#endif
4458

4559
std::cout << "Benchmarking GEMM performance...\n";
4660

4761
for (std::size_t N : sizes) {
4862
Matrix<float> A(N, N);
4963
Matrix<float> B(N, N);
50-
Matrix<float> C_ref(N, N);
5164
Matrix<float> C_our(N, N);
65+
#ifdef TENSORIUM_USE_CBLAS
66+
Matrix<float> C_ref(N, N);
67+
#endif
5268

5369
#pragma omp parallel
5470
{
55-
std::mt19937 rng(42 + omp_get_thread_num());
71+
std::mt19937 rng(42 + omp_get_thread_num());
5672
std::uniform_real_distribution<float> dist(0.0f, 1.0f);
5773

5874
#pragma omp for schedule(static)
@@ -69,15 +85,16 @@ int matrix_bench() {
6985
C_our = mul_mat(A, B);
7086
auto t1 = std::chrono::high_resolution_clock::now();
7187

72-
double elapsed_our = std::chrono::duration<double>(t1 - t0).count();
88+
double elapsed_our = std::chrono::duration<double>(t1 - t0).count();
7389
long double flops = 2.0L * N * N * N;
74-
double gflops_our = static_cast<double>(flops / 1e9L) / elapsed_our;
90+
double gflops_our = static_cast<double>(flops / 1e9L) / elapsed_our;
7591

7692
std::cout << "[Tensorium GEMM]\n";
7793
std::cout << "Time : " << elapsed_our << " s\n";
7894
std::cout << "GFLOP/s : " << gflops_our << "\n";
7995
std::cout << "Sample C(0,0): " << C_our(0, 0) << "\n";
8096

97+
#ifdef TENSORIUM_USE_CBLAS
8198
auto t2 = std::chrono::high_resolution_clock::now();
8299
cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, (int)N, (int)N, (int)N, 1.0f,
83100
A.data.data(), (int)N, B.data.data(), (int)N, 0.0f, C_ref.data.data(), (int)N);
@@ -97,19 +114,17 @@ int matrix_bench() {
97114
std::cout << "BLAS : " << gflops_blas << " GFLOP/s\n";
98115
std::cout << "Speedup : x" << speedup << "\n";
99116

100-
csv << N << ","
101-
<< gflops_our << ","
102-
<< gflops_blas << ","
103-
<< elapsed_our << ","
104-
<< elapsed_blas << ","
105-
<< speedup << "\n";
117+
csv << N << "," << gflops_our << "," << gflops_blas << "," << elapsed_our << ","
118+
<< elapsed_blas << "," << speedup << "\n";
119+
#else
120+
csv << N << "," << gflops_our << "," << elapsed_our << "\n";
121+
#endif
106122
}
107123

108124
csv.close();
109125
std::cout << "\nResults written to: " << csv_path << "\n";
110126
return 0;
111127
}
112-
113128
int matrix_tests() {
114129
using Mat = Matrix<float>;
115130
using Vec = Vector<float>;
@@ -279,7 +294,7 @@ int matrix_tests() {
279294
Cc.scl(2.0f);
280295
Cc = tensorium::mul_mat(Ac, Bc);
281296
std::cout << "✅ add_mat on complex<float> passed.\n";
282-
matrix_bench();
297+
// matrix_bench();
283298

284299
Mat A2(2, 2);
285300
A2(0, 0) = 1;

Tests/main.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ using namespace tensorium;
77
int main() {
88
// #pragma tensorium dispatch
99

10-
// deriv_test();
11-
// linear_solver_test();
10+
deriv_test();
11+
linear_solver_test();
1212
matrix_tests();
13-
// tensor_test();
14-
// vector_tests();
15-
// deriv_test_spectral_fft();
13+
tensor_test();
14+
vector_tests();
15+
deriv_test_spectral_fft();
1616
return 0;
1717
}

0 commit comments

Comments
 (0)