Skip to content

Commit db713f3

Browse files
authored
Cpusummary Note (#133)
* Adjust blocking behavior to cap cache use at L2e and L3e
* Precompile
* Bump version
* Don't use Aqua to test for ambiguities because of ForwardDiff
* More debug, add a check for first cache size being large enough
* Try to avoid Windows issue
* init before precomp
* add CPUSummary note
* take 2
* updates
* disable project_toml_formatting test
* Hopefully more reliable first_cache
1 parent 82a68e7 commit db713f3

File tree

8 files changed

+48
-31
lines changed

8 files changed

+48
-31
lines changed

Project.toml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.3.12"
4+
version = "0.3.13"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
8+
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
89
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
910
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
1011
ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
@@ -15,14 +16,15 @@ ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
1516
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1617

1718
[compat]
18-
ArrayInterface = "3.1.14"
19+
ArrayInterface = "3.1.14, 5.0.1"
20+
CPUSummary = "0.1.1 - 0.1.8, 0.1.14"
1921
IfElse = "0.1"
2022
LoopVectorization = "0.12.86"
2123
ManualMemory = "0.1.1"
2224
PolyesterWeave = "0.1.1"
2325
Requires = "1"
24-
Static = "0.2, 0.3, 0.4"
25-
ThreadingUtilities = "0.4.6"
26+
Static = "0.2, 0.3, 0.4, 0.6"
27+
ThreadingUtilities = "0.5"
2628
VectorizationBase = "0.21.15"
2729
julia = "1.6"
2830

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,16 @@
1818
[ci-julia-nightly-img]: https://github.com/JuliaLinearAlgebra/Octavian.jl/workflows/CI%20(Julia%20nightly)/badge.svg "Continuous Integration (Julia nightly)"
1919
[codecov-img]: https://codecov.io/gh/JuliaLinearAlgebra/Octavian.jl/branch/master/graph/badge.svg "Code Coverage"
2020

21-
Octavian.jl
22-
is a multi-threaded BLAS-like library that provides pure Julia
21+
To make sure CPUSummary 0.1.11 and newer are using `Hwloc`, you may want to run
22+
```julia
23+
julia> using CPUSummary
24+
25+
julia> CPUSummary.use_hwloc(true);
26+
```
27+
which will hopefully enable accurate hardware information. This is the default,
28+
so it should typically be unnecessary.
29+
30+
Octavian.jl is a multi-threaded BLAS-like library that provides pure Julia
2331
matrix multiplication on the CPU, built on top of
2432
[LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl).
2533

src/Octavian.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ using Requires: @require
55
using VectorizationBase, ArrayInterface, LoopVectorization
66

77
using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
8-
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature,
9-
cache_size, num_cores, cache_inclusive, cache_linesize
8+
static_sizeof, StridedPointer, gesp, pause, pick_vector_width, has_feature
9+
using CPUSummary: cache_size, num_cores, cache_inclusive, cache_linesize
1010
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
1111
using ArrayInterface: size, strides, offsets, indices, axes, StrideIndex
1212
using IfElse: ifelse
@@ -15,7 +15,7 @@ using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat
1515
roundtostaticint, floortostaticint
1616
using ManualMemory: MemoryBuffer, load, store!
1717

18-
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait
18+
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait, SPIN
1919

2020
export StaticInt
2121
export matmul!

src/funcptrs.jl

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA
1010
offset, K = load(p, Kd, offset)
1111
offset, N = load(p, Nd, offset)
1212
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
13+
_atomic_store!(p, SPIN)
1314
nothing
1415
end
1516
@inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
@@ -39,6 +40,7 @@ function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(
3940
offset, id = load(p, ID, offset)
4041
offset, total_ids = load(p, TT, offset)
4142
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
43+
_atomic_store!(p, SPIN)
4244
nothing
4345
end
4446

@@ -63,11 +65,17 @@ end
6365
nothing
6466
end
6567

66-
@inline function setup_syncmul!(
68+
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
69+
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
70+
end
71+
72+
struct SyncMulLauncher{W₁, W₂, R₁, R₂} end
73+
@inline function (::SyncMulLauncher{W₁, W₂, R₁, R₂})(
6774
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
68-
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
75+
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT
6976
) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
70-
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
77+
fptr = cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}())
78+
offset = store!(p, fptr, sizeof(UInt))
7179
offset = store!(p, C, offset)
7280
offset = store!(p, A, offset)
7381
offset = store!(p, B, offset)
@@ -82,20 +90,11 @@ end
8290
offset = store!(p, tt, offset)
8391
nothing
8492
end
85-
86-
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
87-
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
88-
end
8993
@inline function launch_thread_mul!(
90-
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
94+
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt,
95+
::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
9196
) where {W₁,W₂,R₁,R₂}
92-
launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt
93-
Base.@_inline_meta
94-
setup_syncmul!(
95-
p, C, A, B, α, β, M, K, N, ap, bcp, id, tt,
96-
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
97-
)
98-
end
97+
launch(SyncMulLauncher{W₁, W₂, R₁, R₂}(), tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt)
9998
end
10099

101100

src/global_constants.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,12 @@ R₁Default() = R₁Default(has_feature(Val(:x86_64_avx512f)))
4747
R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
4848

4949

50+
@static if Sys.ARCH === :x86_64 || Sys.ARCH === :i686
51+
first_cache() = StaticInt{2}()
52+
else
53+
first_cache() = StaticInt{1}()
54+
end
5055

51-
_first_cache(::StaticInt{1}) = StaticInt{1}()
52-
_first_cache(::StaticInt) = StaticInt{2}()
53-
first_cache() = _first_cache(VectorizationBase.num_l2cache())
5456
second_cache() = first_cache() + One()
5557

5658
_first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs)

test/aqua.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
@testset "Aqua.jl" begin
2-
Aqua.test_all(Octavian, ambiguities=false)
2+
Aqua.test_all(Octavian, ambiguities=false, project_toml_formatting=false)
33
@test isempty(Test.detect_ambiguities(Octavian))
44
end

test/matmul_coverage.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
n_values = [1, 10, 20, 50, 100, 150, 200]
2-
k_values = [10, 20, 50, 100, 150, 200]
3-
m_values = [10, 20, 50, 100, 150, 200]
1+
n_values = [1, 10, 20, 50, 100, 150, 200, 400]
2+
k_values = [10, 20, 50, 100, 150, 200, 400]
3+
m_values = [10, 20, 50, 100, 150, 200, 400]
44

55

66

test/runtests.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
import CPUSummary
2+
# Increasing the number of threads must be done before importing Octavian
3+
if Threads.nthreads() > 1 && Sys.CPU_THREADS > 1 && CPUSummary.num_cores() == 1
4+
CPUSummary.num_cores() = CPUSummary.static(2)
5+
end
6+
17
import Octavian
28

39
import Aqua

0 commit comments

Comments
 (0)