Skip to content

Commit b954b9b

Browse files
authored
Adjust blocking behavior to cap cache use at L2e and L3e (#126)
* Adjust blocking behavior to cap cache use at L2e and L3e * Update tilesearch and a few params
1 parent f2e527c commit b954b9b

File tree

7 files changed

+51
-30
lines changed

7 files changed

+51
-30
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.3.9"
4+
version = "0.3.10"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

benchmark/tilesearch.jl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R
55
M, N = size(C); K = size(B,1)
66
zc, za, zb = Octavian.zstridedpointer.((C,A,B))
77
nspawn = VectorizationBase.num_cores()
8+
threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing)
89
t = Inf
910
GC.@preserve C A B begin
1011
for _ ∈ 1:2
11-
t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, Int(nspawn), F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
12+
t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, threads, F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
1213
end
1314
end
15+
Octavian.PolyesterWeave.free_threads!(torelease)
1416
return t
1517
end
1618

@@ -119,15 +121,16 @@ using Optim
119121
hours = 60.0*60.0; days = 24hours;
120122
init = Float64[Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()]
121123
lower = 0.75 .* init;
122-
upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
124+
# upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
125+
upper = [0.9, 1.25init[2], 0.999, 0.999];
123126
# init = [0.001, 0.9754033943603924, 0.5711159869399494, 0.7547361860432168];
124127

128+
#=
125129
opt = Optim.optimize(
126130
matmul_objective, init, ParticleSwarm(lower = lower, upper = upper),
127131
Optim.Options(iterations = 10^6, time_limit = 8hours)
128132
);
129-
130-
133+
=#
131134

132135

133136

src/block_sizes.jl

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -178,25 +178,47 @@ independently of `M`, this algorithm guarantees all threads are on the same page
178178
end
179179
# Takes Nc, calcs Mc and Kc
180180
@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
181-
W = pick_vector_width(T)
182-
α = _α * W
183-
β = _β * W
184-
L₁ₑ = first_cache_size(Val(T)) * R₂
185-
L₂ₑ = second_cache_size(Val(T)) * R₃
181+
W = pick_vector_width(T)
182+
α = _α * W
183+
β = _β * W
184+
L₁ₑ = first_cache_size(Val(T)) * R₂
185+
L₂ₑ = second_cache_size(Val(T)) * R₃
186186

187-
Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
188-
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
189-
Kblock, Krem = divrem_fast(K, Kiter)
190-
Kblock_Krem = Kblock + One()
187+
Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
188+
Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
189+
Kblock, Krem = divrem_fast(K, Kiter)
190+
Kblock_Krem = Kblock + One()
191191

192-
Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ
193-
Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor)
194-
Mblock_Mrem = Mbsize + W * Wfactor
195-
196-
promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter)
192+
Mᵣ = Wfactor * W
193+
Mc_init = floor(Int, Base.FastMath.div_fast(L₁ₑ / Mᵣ, Float64(Kblock_Krem)))
194+
Mc_init_base = max(0, Mc_init - 1)
195+
Kblock_summary = promote(Kblock, Kblock_Krem, Krem, Kiter)
196+
if (Mc_init_base ≠ 0) # Mc_init > 1
197+
Mbsize = Mc_init_base * Mᵣ
198+
Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ)
199+
Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
200+
if Miter == 0
201+
return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
202+
elseif Miter > Mrem
203+
Mblock_Mrem = Mbsize + Mᵣ
204+
Mremfinal = Mbsize + Mblocks_rem
205+
# @show Mbsize * (Miter - 1 - Mrem) + Mrem * Mblock_Mrem + Mremfinal
206+
map(Int, (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)), Kblock_summary
207+
else
208+
_Mbsize, _Mrem, _Mremfinal, _Miter = split_m(M, Miter + (Mrem ≠ 0), Mᵣ)
209+
_Mblock_Mrem = _Mbsize + Mᵣ
210+
return map(Int, (_Mbsize, _Mblock_Mrem, _Mremfinal, _Mrem, _Miter)), Kblock_summary
211+
end
212+
else
213+
Mbsize0 = Int(Mᵣ)
214+
Mblock_Mrem0 = Int(Mᵣ)
215+
Miter0, Mremfinal0 = divrem_fast(M, Mᵣ)
216+
map(Int, (Mbsize0, Mblock_Mrem0, Mremfinal0, 0, Miter0)), Kblock_summary
217+
end
197218
end
198219

199220
@inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
221+
# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`
200222

201223
"""
202224
find_first_acceptable(M, W)

src/global_constants.jl

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,10 @@ MᵣW_mul_factor(::True) = StaticInt{4}()
1616
MᵣW_mul_factor(::False) = StaticInt{9}()
1717
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))
1818

19-
W₁Default(::True) = StaticFloat64{0.009898277594117685}()
20-
# W₁Default(::True) = StaticFloat64{0.0009898277594117685}()
21-
W₂Default(::True) = StaticFloat64{0.9865020832559304}()
22-
R₁Default(::True) = StaticFloat64{0.5820044063603483}()
23-
R₂Default(::True) = StaticFloat64{0.7580885846640107}()
19+
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
20+
W₂Default(::True) = StaticFloat64{0.7398765624419478}()
21+
R₁Default(::True) = StaticFloat64{0.4697043382682602}()
22+
R₂Default(::True) = StaticFloat64{0.6342912896800855}()
2423

2524
W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
2625
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()

test/matmul_coverage.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ m_values = [10, 20, 50, 100, 150, 200]
44

55

66

7-
include("_matmul.jl")
87
typ = get(ENV, "JULIA_TEST_ELTYPE", "ALL")
98
types = if typ == "Float64"
109
DataType[Float64]

test/matmul_main.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ m_values = [200, 300, 400]
55
testset_name_suffix = "(main)"
66

77
for T ∈ (Float64,Float32,Int64,Int32)
8-
@time test_complex(T, m_values, k_values, n_values, testset_name_suffix)
98
@time test_real(T, m_values, k_values, n_values, testset_name_suffix)
9+
@time test_complex(T, m_values, k_values, n_values, testset_name_suffix)
1010
end
1111

1212
A = rand(2,2); B = rand(2,2); AB = A*B; C = fill(NaN, 2, 2);

test/runtests.jl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,10 @@ include("block_sizes.jl")
2323
include("init.jl")
2424
include("integer_division.jl")
2525
include("macrokernels.jl")
26+
include("_matmul.jl")
27+
coverage || include("matmul_main.jl")
2628
include("matmul_coverage.jl")
2729
include("utils.jl")
2830
include("forward_diff.jl")
2931

30-
if !coverage
31-
include("matmul_main.jl")
32-
end
33-
3432
include("aqua.jl") # run the Aqua.jl tests last

0 commit comments

Comments
 (0)