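fix up: inline the hwy.h dummy helper, unify Lut lane dispatch, move FP-exception save/restore into FPExceptions, drop Round::kForce, and emit 0.0 for zeros in generated C arrays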

seiko2plus · seiko2plus · commit 77971c2791a3 · 2025-09-07T23:38:43.000+03:00
diff --git a/npsr/hwy.h b/npsr/hwy.h
@@ -24,7 +24,7 @@ using hn::TFromV;
 using hn::VFromD;
 constexpr bool kNativeFMA = HWY_NATIVE_FMA != 0;
 
-HWY_ATTR void DummyToSuppressUnusedWarning() {}
+inline HWY_ATTR void DummyToSuppressUnusedWarning() {}  // inline: defined in a header (npsr/hwy.h); presumably exists only to silence an unused warning, per its name
 }  // namespace npsr::HWY_NAMESPACE
 HWY_AFTER_NAMESPACE();
 
diff --git a/npsr/lut-inl.h b/npsr/lut-inl.h
@@ -5,6 +5,8 @@
 #define NPSR_LUT_INL_H_
 #endif
 
+#include <tuple>
+
 #include "npsr/hwy.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -107,22 +109,16 @@ class Lut {
     using D = Rebind<T, DU>;
     const D d;
 
-#if !HWY_HAVE_SCALABLE
-    constexpr size_t kLanes = Lanes(du);
-    if constexpr (kLanes == kCols) {
+    HWY_LANES_CONSTEXPR size_t kLanes = Lanes(du);
+    if HWY_LANES_CONSTEXPR (kLanes == kCols) {
       // Vector size matches table width - use single table lookup
       const auto ind = IndicesFromVec(d, idx);
       LoadX1_<Off>(ind, out...);
-    } else if constexpr (kLanes * 2 == kCols) {
+    } else if HWY_LANES_CONSTEXPR (kLanes * 2 == kCols) {
       // Vector size is half table width - use two table lookup
       const auto ind = IndicesFromVec(d, idx);
       LoadX2_<Off>(ind, out...);
-    }
-#else
-    if constexpr (0) {
-    }
-#endif
-    else {
+    } else {
       // Fallback to gather for other configurations
       LoadGather_<Off>(idx, out...);
     }
@@ -135,8 +131,8 @@ class Lut {
     using D = DFromV<OutV0>;
     const D d;
 
-    const OutV0 lut0 = Load(d, row_ + Off);
-    out0 = TableLookupLanes(d, lut0, ind);
+    const OutV0 lut0 = LoadU(d, row_ + Off);
+    out0 = TableLookupLanes(lut0, ind);
 
     if constexpr (sizeof...(OutV) > 0) {
       LoadX1_<Off + kCols>(ind, out...);
diff --git a/npsr/precise.h b/npsr/precise.h
@@ -27,15 +27,6 @@ constexpr auto kNoSpecialCases = _NoSpecialCases{};
 constexpr auto kNoExceptions = _NoExceptions{};
 constexpr auto kLowAccuracy = _LowAccuracy{};
 
-// Rounding mode control
-// Forces a specific rounding mode during computation
-struct Round {
-  struct _Force {
-    static constexpr const char* kName = "kForce";
-  };
-  static constexpr auto kForce = _Force{};
-};
-
 // Subnormal (denormal) number handling modes
 // Controls how the CPU handles numbers smaller than the minimum normalized
 // value
@@ -52,12 +43,34 @@ struct Subnormal {
 
 // Floating-point exception flags
 // These match the standard C library FE_* macros
-struct FPExceptions {
+class FPExceptions {
+ public:
   static constexpr auto kNone = 0;
   static constexpr auto kInvalid = FE_INVALID;
   static constexpr auto kDivByZero = FE_DIVBYZERO;
   static constexpr auto kOverflow = FE_OVERFLOW;
   static constexpr auto kUnderflow = FE_UNDERFLOW;
+
+  void Raise(int errors) noexcept { mask_ |= errors; }  // accumulate flags; they are raised in the destructor
+
+ protected:  // Load() and the destructor are reachable only from derived classes
+  void Load() noexcept {  // snapshot all current FP exception flags into saved_
+    loaded_ = std::fegetexceptflag(&saved_, FE_ALL_EXCEPT) == 0;  // 0 means the snapshot succeeded
+  }
+
+  ~FPExceptions() noexcept {  // first restore the snapshot, then raise the accumulated flags
+    if (loaded_) {  // restore only when Load() captured a snapshot
+      std::fesetexceptflag(&saved_, FE_ALL_EXCEPT);
+    }
+    if (mask_ != kNone) {  // at least one flag was recorded via Raise()
+      std::feraiseexcept(mask_);
+    }
+  }
+
+ private:
+  bool loaded_ = false;  // set by Load() when saved_ holds a snapshot
+  int mask_ = kNone;  // FE_* bits accumulated by Raise()
+  std::fexcept_t saved_;  // left uninitialized; only read when loaded_ is true
 };
 
 /**
@@ -84,7 +97,6 @@ struct FPExceptions {
  * - kNoLargeArgument: Skip extended precision reduction for large arguments
  * - kNoSpecialCases: Skip NaN/Inf handling (assumes finite inputs)
  * - kNoExceptions: Disable FP exception tracking for better performance
- * - Round::kForce: Force round-to-nearest mode
  * - Subnormal::kDAZ/kFTZ: Flush subnormals to zero for performance
  * - Subnormal::kIEEE754: Strict IEEE 754 compliance (default if DAZ/FTZ not
  * specified)
@@ -109,23 +121,13 @@ struct FPExceptions {
  * ```
  */
 template <typename... Args>
-class Precise {
+class Precise : public FPExceptions {
  public:
   // Default constructor saves current FP state
   Precise() noexcept {
     // Save exception flags unless disabled
     if constexpr (!kNoExceptions) {
-      fegetexceptflag(&_exceptions, FE_ALL_EXCEPT);
-    }
-
-    // Force rounding mode if requested
-    if constexpr (kRoundForce) {
-      _rounding_mode = fegetround();
-      int new_mode = _NewRoundingMode();
-      if (_rounding_mode != new_mode) {
-        _retrieve_rounding_mode = true;
-        fesetround(new_mode);
-      }
+      FPExceptions::Load();
     }
   }
 
@@ -136,33 +138,8 @@ class Precise {
     // This constructor exists to enable Precise{tag1, tag2, ...} syntax
   }
 
-  // Restore saved exception flags to FP environment
-  void FlushExceptions() noexcept {
-    if constexpr (!kNoExceptions) {
-      fesetexceptflag(&_exceptions, FE_ALL_EXCEPT);
-    }
-  }
-
-  // Record that an exception occurred (will be raised on destruction)
-  void Raise(int errors) noexcept {
-    static_assert(!kNoExceptions,
-                  "Cannot raise exceptions in NoExceptions mode");
-    _exceptions |= errors;
-  }
-
-  // Destructor restores original FP state
-  ~Precise() noexcept {
-    FlushExceptions();
-    if constexpr (kRoundForce) {
-      if (_retrieve_rounding_mode) {
-        fesetround(_rounding_mode);
-      }
-    }
-  }
-
   // Compile-time configuration queries
   // These allow algorithms to optimize based on precision requirements
-
   static constexpr bool kNoExceptions = (is_same_v<_NoExceptions, Args> || ...);
   static constexpr bool kNoLargeArgument =
       (is_same_v<_NoLargeArgument, Args> || ...);
@@ -176,9 +153,6 @@ class Precise {
   static constexpr bool kSpecialCases = !kNoSpecialCases;
   static constexpr bool kExceptions = !kNoExceptions;
 
-  // Rounding mode configuration
-  static constexpr bool kRoundForce = (is_same_v<Round::_Force, Args> || ...);
-
   // Subnormal handling configuration
   static constexpr bool kDAZ = (is_same_v<Subnormal::_DAZ, Args> || ...);
   static constexpr bool kFTZ = (is_same_v<Subnormal::_FTZ, Args> || ...);
@@ -193,16 +167,6 @@ class Precise {
 
   // Default to IEEE754 if no subnormal mode specified
   static constexpr bool kIEEE754 = _kIEEE754 || !(kDAZ || kFTZ);
-
- private:
-  // Currently only supports round-to-nearest mode
-  // Could be extended to support other modes (toward zero, up, down)
-  int _NewRoundingMode() const { return FE_TONEAREST; }
-
-  // Saved floating-point state
-  int _rounding_mode = 0;
-  bool _retrieve_rounding_mode = false;
-  fexcept_t _exceptions;  // Saved exception flags
 };  // namespace npsr
 
 // Deduction guides for convenient construction
diff --git a/npsr/trig/high-inl.h b/npsr/trig/high-inl.h
@@ -56,7 +56,7 @@ HWY_INLINE V High(V x) {
     //   N' = N - 0.5
     n = Sub(n, Set(d, 0.5f));
   }
-  auto WideCal = [](VW nh, VW xh_abs) -> VW {
+  auto WideCal = [](const VW &nh, const VW &xh_abs) -> VW {
     const DFromV<VW> dw;
     constexpr auto kPiPrec35 = data::kPiPrec35<true>;
     VW r = NegMulAdd(nh, Set(dw, kPiPrec35[0]), xh_abs);
diff --git a/npsr/trig/low-inl.h b/npsr/trig/low-inl.h
@@ -120,13 +120,13 @@ HWY_API V Low(V x) {
     r_lo = NegMulAdd(n, Set(d, kPi[3]), r_lo);
   }
 
-  if (kIsSingle) {
+  if constexpr (kIsSingle) {
     r = r_lo;
   }
   V r2 = Mul(r, r);
   V poly = PolyLow<IS_COS>(r, r2);
 
-  if (!kIsSingle) {
+  if constexpr (!kIsSingle) {
     V r2_corr = Mul(r2, r_lo);
     poly = MulAdd(r2_corr, poly, r_lo);
   }
diff --git a/tools/sollya/cli.py b/tools/sollya/cli.py
@@ -10,7 +10,6 @@
 from pathlib import Path
 from typing import Final
 from dataclasses import dataclass
-from itertools import batched
 
 
 # ANSI color codes for terminal output
diff --git a/tools/sollya/core.sol b/tools/sollya/core.sol
@@ -120,16 +120,29 @@ procedure PrettyJoin(pList, pSfx, pSep, pLineEvery) {
   return r;
 };
 
-// C array formatting procedures
+// Ensure zeros are represented as 0.0 for C++ template deduction; returns a new list
+procedure FixZero(pList) {
+  var r, i;
+  r = [||]; // start from an empty list and append to it
+  for i in pList do {
+    if (i == 0) then {
+      r = r :. "0.0"; // Ensure zero is represented as 0.0 (appended as a string)
+    } else {
+      r = r :. i; // every other element is appended unchanged
+    };
+  };
+  return r;
+};
 
+// C array formatting procedures
 // Generate C array initializer
 procedure CArray(pList, pLineEvery) {
   return "{\n" @ PrettyJoin(pList, "", ", ", pLineEvery) @ "}";
 };
 
 // Generate C array with type-specific suffix (e.g., "f" for float)
 procedure CArrayT(pT, pList, pLineEvery) {
-  return "{\n" @ PrettyJoin(pList, pT.kCSFX, ", ", pLineEvery) @ "}";
+  return "{\n" @ PrettyJoin(FixZero(pList), pT.kCSFX, ", ", pLineEvery) @ "}";
 };
 
 // Generate C array with unsigned integer suffix

Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ HWY_INLINE V High(V x) {`
`56`	`56`	`// N' = N - 0.5`
`57`	`57`	`n = Sub(n, Set(d, 0.5f));`
`58`	`58`	`}`
`59`		`- auto WideCal = [](VW nh, VW xh_abs) -> VW {`
	`59`	`+ auto WideCal = [](const VW &nh, const VW &xh_abs) -> VW {`
`60`	`60`	`const DFromV<VW> dw;`
`61`	`61`	`constexpr auto kPiPrec35 = data::kPiPrec35<true>;`
`62`	`62`	`VW r = NegMulAdd(nh, Set(dw, kPiPrec35[0]), xh_abs);`
Original file line number	Diff line number	Diff line change
`@@ -120,13 +120,13 @@ HWY_API V Low(V x) {`
`120`	`120`	`r_lo = NegMulAdd(n, Set(d, kPi[3]), r_lo);`
`121`	`121`	`}`
`122`	`122`
`123`		`- if (kIsSingle) {`
	`123`	`+ if constexpr (kIsSingle) {`
`124`	`124`	`r = r_lo;`
`125`	`125`	`}`
`126`	`126`	`V r2 = Mul(r, r);`
`127`	`127`	`V poly = PolyLow<IS_COS>(r, r2);`
`128`	`128`
`129`		`- if (!kIsSingle) {`
	`129`	`+ if constexpr (!kIsSingle) {`
`130`	`130`	`V r2_corr = Mul(r2, r_lo);`
`131`	`131`	`poly = MulAdd(r2_corr, poly, r_lo);`
`132`	`132`	`}`