Skip to content

Commit 863b10a

Browse files
committed
Refactor Sollya integration and enhance trigonometric function precision
- Completely rewrote Sollya integration with a new Python CLI tool (tools/sollya/cli.py) - Replaced the old generate.py script with a more robust implementation - Added comprehensive Sollya core library (tools/sollya/core.sol) with utility functions - Created new SIMD-optimized lookup table implementation (npsr/lut-inl.h) - Significantly enhanced the Precise class for floating-point precision control - Refactored trigonometric data generation scripts with better documentation - Updated constants generation with improved Cody-Waite and Payne-Hanek reduction - Added kpi16-inl.h.sol for high-precision π/16 table generation - Removed deprecated utils-inl.h and old trigonometric data files - Improved project configuration in pyproject.toml and .spin/cmds.py - Enhanced code documentation throughout the codebase
1 parent 751831f commit 863b10a

File tree

22 files changed

+1300
-611
lines changed

22 files changed

+1300
-611
lines changed

.spin/cmds.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
1-
import os
21
import pathlib
32
import sys
43
import click
5-
import spin
6-
from spin.cmds import meson
74

85
curdir = pathlib.Path(__file__).parent
96
rootdir = curdir.parent
7+
toolsdir = rootdir / "tools"
8+
sys.path.insert(0, str(toolsdir))
109

1110

12-
@click.command(help="Generate sollya python based files")
11+
@click.command(help="Generate sollya c++/python based files")
1312
@click.option("-f", "--force", is_flag=True, help="Force regenerate all files")
14-
def generate(*, force):
15-
spin.util.run(
16-
["python", str(rootdir / "tools" / "sollya" / "generate.py")]
17-
+ (["--force"] if force else []),
18-
)
13+
def sollya(*, force):
14+
import sollya # type: ignore[import]
15+
16+
sollya.main(force=force)

npsr/common.h

Lines changed: 0 additions & 37 deletions
This file was deleted.

npsr/hwy.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#ifndef NPSR_HWY_H_
2+
#define NPSR_HWY_H_
3+
4+
#include <hwy/highway.h>
5+
6+
#endif // NPSR_HWY_H_
7+
8+
#if defined(NPSR_HWY_FOREACH_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
9+
#ifdef NPSR_HWY_FOREACH_H_
10+
#undef NPSR_HWY_FOREACH_H_
11+
#else
12+
#define NPSR_HWY_FOREACH_H_
13+
#endif
14+
15+
HWY_BEFORE_NAMESPACE();
16+
namespace npsr::HWY_NAMESPACE {
17+
namespace hn = hwy::HWY_NAMESPACE;
18+
using hn::DFromV;
19+
using hn::MFromD;
20+
using hn::Rebind;
21+
using hn::RebindToUnsigned;
22+
using hn::TFromD;
23+
using hn::TFromV;
24+
using hn::VFromD;
25+
constexpr bool kNativeFMA = HWY_NATIVE_FMA != 0;
26+
27+
inline HWY_ATTR void DummyToSuppressUnusedWarning() {}
28+
} // namespace npsr::HWY_NAMESPACE
29+
HWY_AFTER_NAMESPACE();
30+
31+
#endif // NPSR_HWY_FOREACH_H_

npsr/lut-inl.h

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#if defined(NPSR_LUT_INL_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
2+
#ifdef NPSR_LUT_INL_H_
3+
#undef NPSR_LUT_INL_H_
4+
#else
5+
#define NPSR_LUT_INL_H_
6+
#endif
7+
8+
#include <tuple>
9+
10+
#include "npsr/hwy.h"
11+
12+
HWY_BEFORE_NAMESPACE();
13+
14+
namespace npsr::HWY_NAMESPACE {
15+
16+
/**
17+
* @brief SIMD-optimized lookup table implementation
18+
*
19+
* This class provides an efficient lookup table.
20+
* It stores data in both row-major and column-major
21+
* formats to optimize different access patterns.
22+
*
23+
* @tparam T Element type (must match SIMD vector element type)
24+
* @tparam kRows Number of rows in the lookup table
25+
* @tparam kCols Number of columns in the lookup table
26+
*
27+
* Example usage:
28+
* @code
29+
* // Create a 2x4 lookup table
30+
* constexpr Lut lut{{1.0f, 2.0f, 3.0f, 4.0f}, {5.0f, 6.0f, 7.0f, 8.0f}};
31+
* // Load values using SIMD indices
32+
* auto indices = Set(d, 2); // SIMD vector of indices
33+
* Vec<D> out0, out1;
34+
* lut.Load(indices, out0, out1);
35+
* @endcode
36+
*/
37+
template <typename T, size_t kRows, size_t kCols>
38+
class Lut {
39+
public:
40+
static constexpr size_t kLength = kRows * kCols;
41+
42+
/**
43+
* @brief Construct a lookup table from row arrays
44+
*
45+
* @tparam ColSizes Size of each row array (deduced)
46+
* @param rows Variable number of arrays, each representing a row
47+
*
48+
* @note All rows must have exactly kCols elements
49+
* @note The constructor is constexpr for compile-time initialization
50+
*/
51+
template <size_t... ColSizes>
52+
constexpr Lut(const T (&...rows)[ColSizes]) : row_{} {
53+
// Check that we have the right number of rows
54+
static_assert(sizeof...(rows) == kRows,
55+
"Number of rows doesn't match template parameter");
56+
// Check that all rows have the same number of columns
57+
static_assert(((ColSizes == kCols) && ...),
58+
"All rows must have the same number of columns");
59+
60+
// Copy data using recursive template approach
61+
ToRowMajor_<0>(rows...);
62+
}
63+
64+
/**
65+
* @brief Load values from the LUT using SIMD indices
66+
*
67+
* This method performs efficient SIMD lookups by selecting the optimal
68+
* implementation based on the vector size and LUT dimensions.
69+
*
70+
* @tparam VU SIMD vector type for indices
71+
* @tparam OutV Output SIMD vector types (must match number of rows)
72+
* @param idx SIMD vector of column indices
73+
* @param out Output vectors (one per row)
74+
*
75+
* @note The number of output vectors must exactly match kRows
76+
* @note Index values must be in range [0, kCols)
77+
*/
78+
template <typename VU, typename... OutV>
79+
HWY_ATTR void Load(VU idx, OutV &...out) const {
80+
static_assert(sizeof...(OutV) == kRows,
81+
"Number of output vectors must match number of rows in LUT");
82+
using namespace hn;
83+
using TU = TFromV<VU>;
84+
static_assert(sizeof(TU) == sizeof(T),
85+
"Index type must match LUT element type");
86+
// Row-major based optimization
87+
LoadRow_(idx, out...);
88+
}
89+
90+
private:
91+
/// Convert input rows to row-major storage format
92+
template <size_t RowIDX, size_t... ColSizes>
93+
constexpr void ToRowMajor_(const T (&...rows)[ColSizes]) {
94+
if constexpr (RowIDX < kRows) {
95+
auto row_array = std::get<RowIDX>(std::make_tuple(rows...));
96+
for (size_t col = 0; col < kCols; ++col) {
97+
row_[RowIDX * kCols + col] = row_array[col];
98+
}
99+
ToRowMajor_<RowIDX + 1>(rows...);
100+
}
101+
}
102+
103+
/// Dispatch to optimal row-load implementation based on vector/LUT size
104+
template <size_t Off = 0, typename VU, typename... OutV>
105+
HWY_ATTR void LoadRow_(VU idx, OutV &...out) const {
106+
using namespace hn;
107+
using DU = DFromV<VU>;
108+
const DU du;
109+
using D = Rebind<T, DU>;
110+
const D d;
111+
112+
HWY_LANES_CONSTEXPR size_t kLanes = Lanes(du);
113+
if HWY_LANES_CONSTEXPR (kLanes == kCols) {
114+
// Vector size matches table width - use single table lookup
115+
const auto ind = IndicesFromVec(d, idx);
116+
LoadX1_<Off>(ind, out...);
117+
} else if HWY_LANES_CONSTEXPR (kLanes * 2 == kCols) {
118+
// Vector size is half table width - use two table lookup
119+
const auto ind = IndicesFromVec(d, idx);
120+
LoadX2_<Off>(ind, out...);
121+
} else {
122+
// Fallback to gather for other configurations
123+
LoadGather_<Off>(idx, out...);
124+
}
125+
}
126+
127+
// Load using single table lookup (vector size == table width)
128+
template <size_t Off = 0, typename VInd, typename OutV0, typename... OutV>
129+
HWY_ATTR void LoadX1_(const VInd &ind, OutV0 &out0, OutV &...out) const {
130+
using namespace hn;
131+
using D = DFromV<OutV0>;
132+
const D d;
133+
134+
const OutV0 lut0 = LoadU(d, row_ + Off);
135+
out0 = TableLookupLanes(lut0, ind);
136+
137+
if constexpr (sizeof...(OutV) > 0) {
138+
LoadX1_<Off + kCols>(ind, out...);
139+
}
140+
}
141+
142+
// Load using two table lookups (vector size == table width / 2)
143+
template <size_t Off = 0, typename VInd, typename OutV0, typename... OutV>
144+
HWY_ATTR void LoadX2_(const VInd &ind, OutV0 &out0, OutV &...out) const {
145+
using namespace hn;
146+
using D = DFromV<OutV0>;
147+
const D d;
148+
149+
constexpr size_t kLanes = kCols / 2;
150+
const OutV0 lut0 = LoadU(d, row_ + Off);
151+
const OutV0 lut1 = LoadU(d, row_ + Off + kLanes);
152+
out0 = TwoTablesLookupLanes(d, lut0, lut1, ind);
153+
154+
if constexpr (sizeof...(OutV) > 0) {
155+
LoadX2_<Off + kCols>(ind, out...);
156+
}
157+
}
158+
159+
// General fallback using gather instructions
160+
template <size_t Off = 0, typename VU, typename OutV0, typename... OutV>
161+
HWY_ATTR void LoadGather_(const VU &idx, OutV0 &out0, OutV &...out) const {
162+
using namespace hn;
163+
using D = DFromV<OutV0>;
164+
const D d;
165+
out0 = GatherIndex(d, row_ + Off, BitCast(RebindToSigned<D>(), idx));
166+
if constexpr (sizeof...(OutV) > 0) {
167+
LoadGather_<Off + kCols>(idx, out...);
168+
}
169+
}
170+
171+
// Row-major
172+
HWY_ALIGN T row_[kLength];
173+
};
174+
175+
/**
176+
* @brief Deduction guide for automatic dimension detection
177+
*
178+
* Allows constructing a Lut without explicitly specifying dimensions:
179+
* @code
180+
* Lut lut{row0, row1, row2}; // Dimensions deduced from arrays
181+
* @endcode
182+
*/
183+
template <typename T, size_t First, size_t... Rest>
184+
Lut(const T (&first)[First], const T (&...rest)[Rest])
185+
-> Lut<T, 1 + sizeof...(Rest), First>;
186+
187+
/**
188+
* @brief Factory function that requires explicit type specification
189+
*
190+
* This approach forces users to specify the type T explicitly while
191+
* automatically deducing the dimensions from the array arguments.
192+
*
193+
* Note: We use MakeLut since partial deduction guides (e.g., Lut<float>{...})
194+
* require C++20, but this codebase targets C++17.
195+
*
196+
* @tparam T Element type (must be explicitly specified)
197+
* @param first First row array
198+
* @param rest Additional row arrays
199+
* @return Lut with deduced dimensions
200+
*
201+
* Usage:
202+
* @code
203+
* auto lut = MakeLut<float>(row0, row1, row2); // T explicit, dimensions
204+
* deduced
205+
* @endcode
206+
*/
207+
template <typename T, size_t First, size_t... Rest>
208+
constexpr auto MakeLut(const T (&first)[First], const T (&...rest)[Rest]) {
209+
return Lut<T, 1 + sizeof...(Rest), First>{first, rest...};
210+
}
211+
212+
} // namespace npsr::HWY_NAMESPACE
213+
214+
HWY_AFTER_NAMESPACE();
215+
216+
#endif // NPSR_LUT_INL_H_

npsr/npsr.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
// To include them once per target, which is ensured by the toggle check.
2-
// clang-format off
3-
#if defined(_NPSR_NPSR_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
4-
#ifdef _NPSR_NPSR_H_
5-
#undef _NPSR_NPSR_H_
2+
#if defined(NPSR_NPSR_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
3+
#ifdef NPSR_NPSR_H_
4+
#undef NPSR_NPSR_H_
65
#else
7-
#define _NPSR_NPSR_H_
6+
#define NPSR_NPSR_H_
87
#endif
98

109
#include "npsr/trig/inl.h"
1110

12-
#endif // _NPSR_NPSR_H_
11+
#endif // NPSR_NPSR_H_

0 commit comments

Comments
 (0)