Commit c1f580e (1 parent: f383510)

does not compile atm, need to fix some issues

File tree: 6 files changed, +142 −50 lines

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -49,7 +49,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "am
 
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(STATUS "Configuring for Apple Silicon ARM64: disabling AVX flags")
-    # Apple Clang has no AVX/FMA support → stick with the generic ARM optimizations
     set(CMAKE_CXX_FLAGS "-O3 -mcpu=apple-m1 -Wno-ignored-attributes")
 else()
     message(WARNING "Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}); using generic optimization flags.")
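
Note: the Matrix.hpp hunk below branches on TENSORIUM_X86 / TENSORIUM_ARM, but this commit never shows where those macros get defined (plausibly part of the "does not compile atm" in the message). A minimal sketch of how the CMake branches above could export them (the wiring is assumed, not taken from the repo):

# Hypothetical wiring: export the architecture macros the C++ code tests for.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
    add_compile_definitions(TENSORIUM_X86)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
    add_compile_definitions(TENSORIUM_ARM)
endif()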

Tests/test.hpp

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 #include <cmath>
 #include <vector>
 #include <chrono>
-#include <immintrin.h>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>

includes/Tensorium/Core/Derivate.hpp

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@
 #include "Vector.hpp"
 #include <cassert>
 #include <cmath>
-#include <immintrin.h>
 #include <iostream>
 #include <numeric>
 #include <vector>

includes/Tensorium/Core/Matrix.hpp

Lines changed: 63 additions & 31 deletions
@@ -8,7 +8,6 @@
 #include "Vector.hpp"
 #include <cassert>
 #include <cmath>
-#include <immintrin.h>
 #include <iostream>
 #include <vector>

@@ -209,25 +208,51 @@ template <typename K, bool RowMajor = false> class Matrix {
  * Uses blocking and micro-kernels to avoid cache bottleneck with FMA/AVX units repartition.
  * Fast-paths exist for 4×4, 8×8, and 16×16.
  */
+
 inline Matrix _mul_mat(const Matrix<K> &mat) const {
     if (cols != mat.rows)
         throw std::invalid_argument("Matrix dimensions do not match for multiplication");
 
     Matrix<K> result(rows, mat.cols);
 
-    const K *A = data.data();     // Already column-major (this)
-    const K *B = mat.data.data(); // Already column-major (rhs)
-    K *C = result.data.data();    // Output (also column-major)
+    const K *A = data.data();     // column-major (this)
+    const K *B = mat.data.data(); // column-major (rhs)
+    K *C = result.data.data();    // column-major output
 
+#if defined(TENSORIUM_X86)
+    // SIMD kernel for x86 (AVX2 / AVX512)
     tensorium::GemmKernelBigger<K> kernel;
     kernel.matmul(const_cast<K *>(A), const_cast<K *>(B), C,
                   static_cast<int>(rows),     // M
                   static_cast<int>(mat.cols), // N
-                  static_cast<int>(cols)      // K
-    );
+                  static_cast<int>(cols));    // K
+
+#elif defined(TENSORIUM_ARM)
+    // Temporary fallback (naïve scalar matmul)
+    for (size_t i = 0; i < rows; ++i) {
+        for (size_t j = 0; j < mat.cols; ++j) {
+            K sum = static_cast<K>(0);
+            for (size_t k = 0; k < cols; ++k)
+                sum += A[i + k * rows] * B[k + j * mat.rows];
+            C[i + j * rows] = sum;
+        }
+    }
+
+#else
+    // Generic scalar fallback
+    for (size_t i = 0; i < rows; ++i) {
+        for (size_t j = 0; j < mat.cols; ++j) {
+            K sum = static_cast<K>(0);
+            for (size_t k = 0; k < cols; ++k)
+                sum += A[i + k * rows] * B[k + j * mat.rows];
+            C[i + j * rows] = sum;
+        }
+    }
+#endif
 
     return result;
 }
+
 /**
  * @brief Multiply matrix by a vector using SIMD
  *
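
For reference, the indexing in the scalar fallback follows column-major storage: element (i, j) of an M-row matrix sits at data[i + j * M]. A standalone sketch of the same triple loop, with a tiny 2×2 check (helper name is illustrative, not from the repo):

#include <cassert>
#include <cstddef>

// Column-major GEMM, same loop structure as the fallback above:
// C (M x N) = A (M x K) * B (K x N); element (i, j) lives at base[i + j * rows].
template <typename T>
void naive_gemm_colmajor(const T *A, const T *B, T *C,
                         std::size_t M, std::size_t N, std::size_t K) {
    for (std::size_t i = 0; i < M; ++i)
        for (std::size_t j = 0; j < N; ++j) {
            T sum = T(0);
            for (std::size_t k = 0; k < K; ++k)
                sum += A[i + k * M] * B[k + j * K];
            C[i + j * M] = sum;
        }
}

int main() {
    // A = [[1,2],[3,4]] stored column-major; B = identity, so C must equal A.
    double A[4] = {1, 3, 2, 4};
    double B[4] = {1, 0, 0, 1};
    double C[4] = {0, 0, 0, 0};
    naive_gemm_colmajor(A, B, C, 2, 2, 2);
    assert(C[0] == 1 && C[1] == 3 && C[2] == 2 && C[3] == 4);
    return 0;
}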
@@ -455,37 +480,44 @@ template <typename K, bool RowMajor = false> class Matrix {
     }
 
     return r;
-    }
+}
 
-    Matrix& operator+=(const Matrix& m) { this->add(m); return *this; }
-    Matrix& operator-=(const Matrix& m) { this->sub(m); return *this; }
-    Matrix& operator*=(K alpha) { this->scl(alpha); return *this; }
+    Matrix &operator+=(const Matrix &m) {
+        this->add(m);
+        return *this;
+    }
+    Matrix &operator-=(const Matrix &m) {
+        this->sub(m);
+        return *this;
+    }
+    Matrix &operator*=(K alpha) {
+        this->scl(alpha);
+        return *this;
+    }
 };
-template<typename K, bool RM>
-Matrix<K, RM> operator+(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-    Matrix<K, RM> res = a;
-    res.add(b);
-    return res;
+template <typename K, bool RM>
+Matrix<K, RM> operator+(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    Matrix<K, RM> res = a;
+    res.add(b);
+    return res;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator-(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-    Matrix<K, RM> res = a;
-    res.sub(b);
-    return res;
+template <typename K, bool RM>
+Matrix<K, RM> operator-(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    Matrix<K, RM> res = a;
+    res.sub(b);
+    return res;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(const Matrix<K, RM>& a, const Matrix<K, RM>& b) {
-    return a._mul_mat(b);
+template <typename K, bool RM>
+Matrix<K, RM> operator*(const Matrix<K, RM> &a, const Matrix<K, RM> &b) {
+    return a._mul_mat(b);
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(const Matrix<K, RM>& m, K alpha) {
-    Matrix<K, RM> res = m;
-    res.scl(alpha);
-    return res;
+template <typename K, bool RM> Matrix<K, RM> operator*(const Matrix<K, RM> &m, K alpha) {
+    Matrix<K, RM> res = m;
+    res.scl(alpha);
+    return res;
 }
-template<typename K, bool RM>
-Matrix<K, RM> operator*(K alpha, const Matrix<K, RM>& m) {
-    return m * alpha;
+template <typename K, bool RM> Matrix<K, RM> operator*(K alpha, const Matrix<K, RM> &m) {
+    return m * alpha;
 }
 
 } // namespace tensorium
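
A short usage sketch of the operator set above. The Matrix(rows, cols) constructor appears in the diff; how elements get filled is not shown in this commit, so that step is left as a comment:

#include "Tensorium/Core/Matrix.hpp" // path per this repo's include tree

int main() {
    using tensorium::Matrix;
    Matrix<float> A(64, 64), B(64, 64);
    // ... fill A and B (element access is not part of this commit) ...
    Matrix<float> C = A * B; // _mul_mat: GemmKernelBigger on TENSORIUM_X86,
                             // the scalar loops on TENSORIUM_ARM or unknown targets
    C += A;                  // in-place add via Matrix::add
    C *= 0.5f;               // in-place scale via Matrix::scl
    return 0;
}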

includes/Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,8 @@
  * sizes
  *
  */
+#ifdef TENSORIUM_X86
+
 namespace tensorium {
 template <typename T> class GemmKernelBigger {
 public:

@@ -877,3 +879,4 @@ template <typename T> T GemmKernelBigger<T>::blockA_packed[MC * KC] __attribute_
 
 template <typename T> T GemmKernelBigger<T>::blockB_packed[NC * KC] __attribute__((aligned(64)));
 } // namespace tensorium
+#endif
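
With the whole kernel header fenced by TENSORIUM_X86, callers can include it unconditionally; a sketch of the consuming side (build-system macro assumed, as above):

#include "Tensorium/Core/MatrixKernels/GemmKernel_bigger.hpp"

#if defined(TENSORIUM_X86)
// tensorium::GemmKernelBigger<T> exists only in x86 builds; other
// targets fall through to the scalar paths in Matrix.hpp.
#endif

int main() { return 0; }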

includes/Tensorium/SIMD/SIMD.hpp

Lines changed: 76 additions & 16 deletions
@@ -27,6 +27,21 @@
 # define ALIGN 8
 #endif
 
+// Defaults if nothing else sets them further down
+#ifndef UNROLL
+# define UNROLL 4
+#endif
+#ifndef SIMD_WIDTH
+# define SIMD_WIDTH 4
+#endif
+#ifndef ALIGN
+# define ALIGN 16
+#endif
+
+// OpenMP: only pull in the include when the compiler enables it
+#ifdef _OPENMP
+# include <omp.h>
+#endif
 // disable x86-only prefetch macros on ARM
 #if !defined(__x86_64__)
 # define _MM_HINT_T0 0
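
The _OPENMP guard keeps the header buildable without -fopenmp. The same pattern at a use site, as a minimal sketch (not code from this repo):

#include <vector>
#ifdef _OPENMP
# include <omp.h> // only present when the compiler enables OpenMP
#endif

// The pragma is ignored when OpenMP is off, so both builds compile.
double sum(const std::vector<double> &v) {
    double s = 0.0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : s)
#endif
    for (long i = 0; i < (long)v.size(); ++i)
        s += v[i];
    return s;
}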
@@ -1071,7 +1086,15 @@ template <typename F> inline void dispatch_simd(F &&f) { f(DefaultISA{}); }
 
 namespace simd {
 
-// ───────────────────────── float32 (neon32_t) ─────────────────────────
+static inline float32x4_t andnot_f32(float32x4_t a, float32x4_t b) {
+    uint32x4_t na = veorq_u32(vreinterpretq_u32_f32(a), vdupq_n_u32(~0u));
+    return vreinterpretq_f32_u32(vandq_u32(na, vreinterpretq_u32_f32(b)));
+}
+static inline float64x2_t andnot_f64(float64x2_t a, float64x2_t b) {
+    uint64x2_t na = veorq_u64(vreinterpretq_u64_f64(a), vdupq_n_u64(~0ULL));
+    return vreinterpretq_f64_u64(vandq_u64(na, vreinterpretq_u64_f64(b)));
+}
+
 template <> struct SimdTraits<float, neon32_t> {
     using reg = float32x4_t;
     static constexpr size_t width = 4;
@@ -1083,6 +1106,7 @@ template <> struct SimdTraits<float, neon32_t> {
     static inline reg loadu(const float *p) { return vld1q_f32(p); }
     static inline void store(float *p, reg v) { vst1q_f32(p, v); }
     static inline void storeu(float *p, reg v) { vst1q_f32(p, v); }
+    static inline void store_stream(float *p, reg v) { vst1q_f32(p, v); } // no NEON NT store
     static inline reg zero() { return vdupq_n_f32(0.f); }
     static inline reg add(reg a, reg b) { return vaddq_f32(a, b); }
     static inline reg sub(reg a, reg b) { return vsubq_f32(a, b); }
@@ -1092,6 +1116,17 @@ template <> struct SimdTraits<float, neon32_t> {
 # else
     static inline reg fmadd(reg a, reg b, reg c) { return vaddq_f32(c, vmulq_f32(a, b)); }
 # endif
+    static inline reg max(reg a, reg b) { return vmaxq_f32(a, b); }
+    static inline reg min(reg a, reg b) { return vminq_f32(a, b); }
+    static inline reg andnot(reg a, reg b) { return andnot_f32(a, b); }
+
+    // float (neon32_t)
+    static inline float extract(reg x, size_t idx) {
+        alignas(16) float t[4];
+        vst1q_f32(t, x);
+        return t[idx & 3];
+    }
+
     static inline float horizontal_add(reg v) {
         float32x2_t lo = vget_low_f32(v);
         float32x2_t hi = vget_high_f32(v);
@@ -1112,24 +1147,31 @@ template <> struct SimdTraits<double, neon64_t> {
     static inline reg loadu(const double *p) { return vld1q_f64(p); }
     static inline void store(double *p, reg v) { vst1q_f64(p, v); }
     static inline void storeu(double *p, reg v) { vst1q_f64(p, v); }
+    static inline void store_stream(double *p, reg v) { vst1q_f64(p, v); }
     static inline reg zero() { return vdupq_n_f64(0.0); }
     static inline reg add(reg a, reg b) { return vaddq_f64(a, b); }
     static inline reg sub(reg a, reg b) { return vsubq_f64(a, b); }
     static inline reg mul(reg a, reg b) { return vmulq_f64(a, b); }
 # if defined(__aarch64__)
-    static inline reg fmadd(reg a, reg b, reg c) { return vfmaq_f64(c, a, b); } // c + a*b
+    static inline reg fmadd(reg a, reg b, reg c) { return vfmaq_f64(c, a, b); }
 # else
     static inline reg fmadd(reg a, reg b, reg c) { return vaddq_f64(c, vmulq_f64(a, b)); }
 # endif
+    static inline reg max(reg a, reg b) { return vmaxq_f64(a, b); }
+    static inline reg min(reg a, reg b) { return vminq_f64(a, b); }
+    static inline reg andnot(reg a, reg b) { return andnot_f64(a, b); }
+    static inline double extract(reg x, size_t idx = 0) {
+        alignas(16) double t[2];
+        vst1q_f64(t, x);
+        return t[idx & 1];
+    }
     static inline double horizontal_add(reg v) {
         float64x1_t s = vadd_f64(vget_low_f64(v), vget_high_f64(v));
         return vget_lane_f64(s, 0);
     }
 };
 
-// ───────────────────────── 64-bit integer (simple fallback) ─────────────────────────
-// Stick to elementary operations (no portable 64x64 SIMD multiply in NEON).
-template <> struct SimdTraits<size_t, neon64_t> {
+template <> struct SimdTraits<size_t, neon32_t> {
     using reg = uint64x2_t;
     static constexpr size_t width = 2;
     static constexpr size_t alignment = 16;
@@ -1140,11 +1182,11 @@ template <> struct SimdTraits<size_t, neon64_t> {
     static inline reg loadu(const size_t *p) { return vld1q_u64((const uint64_t *)p); }
     static inline void store(size_t *p, reg v) { vst1q_u64((uint64_t *)p, v); }
     static inline void storeu(size_t *p, reg v) { vst1q_u64((uint64_t *)p, v); }
+    static inline void store_stream(size_t *p, reg v) { vst1q_u64((uint64_t *)p, v); }
     static inline reg zero() { return vdupq_n_u64(0); }
     static inline reg add(reg a, reg b) { return vaddq_u64(a, b); }
     static inline reg sub(reg a, reg b) { return vsubq_u64(a, b); }
-    // element-wise (scalar) mul to stay portable
-    static inline reg mul(reg a, reg b) {
+    static inline reg mul(reg a, reg b) {
         uint64_t A[2], B[2], R[2];
         vst1q_u64(A, a);
         vst1q_u64(B, b);
@@ -1161,10 +1203,19 @@ template <> struct SimdTraits<size_t, neon64_t> {
         R[1] = A[1] * B[1] + C[1];
         return vld1q_u64(R);
     }
+    static inline reg andnot(reg a, reg b) {
+        uint64x2_t na = veorq_u64(a, vdupq_n_u64(~0ULL));
+        return vandq_u64(na, b);
+    }
+    static inline size_t extract(reg x, size_t idx) {
+        alignas(16) uint64_t t[2];
+        vst1q_u64(t, x);
+        return (size_t)t[idx & 1];
+    }
     static inline uint64_t horizontal_add(reg v) {
-        uint64_t tmp[2];
-        vst1q_u64(tmp, v);
-        return tmp[0] + tmp[1];
+        alignas(16) uint64_t t[2];
+        vst1q_u64(t, v);
+        return t[0] + t[1];
     }
 };
 
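NEON has no 64×64-bit vector multiply (there is no vmulq_u64), which is why mul above round-trips through scalar arrays. The pattern in isolation, for an AArch64 target:

#include <arm_neon.h>
#include <cassert>
#include <cstdint>

// Store lanes, multiply in scalar code, reload, as in SimdTraits<size_t>::mul.
static uint64x2_t mul_u64(uint64x2_t a, uint64x2_t b) {
    uint64_t A[2], B[2], R[2];
    vst1q_u64(A, a);
    vst1q_u64(B, b);
    R[0] = A[0] * B[0];
    R[1] = A[1] * B[1];
    return vld1q_u64(R);
}

int main() {
    uint64_t t[2];
    vst1q_u64(t, mul_u64(vdupq_n_u64(3), vdupq_n_u64(7)));
    assert(t[0] == 21 && t[1] == 21);
    return 0;
}
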
@@ -1247,12 +1298,21 @@ template <> struct SimdTraits<std::complex<double>, neon64_t> {
 } // namespace simd
 
 namespace detail {
-template <typename Simd> static inline typename Simd::reg reduce_sum(typename Simd::reg v) {
-# if defined(__x86_64__)
-    return v;
-# else
-    return v;
-# endif
+inline float reduce_sum(float32x4_t v) { // float
+    float32x2_t lo = vget_low_f32(v);
+    float32x2_t hi = vget_high_f32(v);
+    float32x2_t s = vadd_f32(lo, hi);
+    s = vpadd_f32(s, s);
+    return vget_lane_f32(s, 0);
+}
+inline double reduce_sum(float64x2_t v) { // double
+    float64x1_t s = vadd_f64(vget_low_f64(v), vget_high_f64(v));
+    return vget_lane_f64(s, 0);
+}
+inline uint64_t reduce_sum(uint64x2_t v) { // integer
+    uint64_t t[2];
+    vst1q_u64(t, v);
+    return t[0] + t[1];
 }
 } // namespace detail
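
Finally, a self-contained check of the andnot and reduce_sum semantics added above; the bodies are copied here rather than including the header, so the sketch builds standalone on AArch64:

#include <arm_neon.h>
#include <cassert>

// andnot(a, b) = (~a) & b, mirroring x86 _mm_andnot_ps; with the sign-bit
// mask as `a` it clears signs, i.e. computes |b| lane-wise.
static float32x4_t andnot_f32(float32x4_t a, float32x4_t b) {
    uint32x4_t na = veorq_u32(vreinterpretq_u32_f32(a), vdupq_n_u32(~0u));
    return vreinterpretq_f32_u32(vandq_u32(na, vreinterpretq_u32_f32(b)));
}

static float reduce_sum(float32x4_t v) {
    float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v));
    s = vpadd_f32(s, s);
    return vget_lane_f32(s, 0);
}

int main() {
    const float xs[4] = {-1.f, 2.f, -3.f, 4.f};
    float32x4_t x = vld1q_f32(xs);
    float32x4_t sign = vreinterpretq_f32_u32(vdupq_n_u32(0x80000000u));
    assert(reduce_sum(andnot_f32(sign, x)) == 10.f); // |x| summed: 1+2+3+4
    return 0;
}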
