Adjust blocking behavior to cap cache use at L2e and L3e (#126)

chriselrod · web-flow · commit b954b9b954eb · 2021-12-05T16:23:13.000-05:00
* Adjust blocking behavior to cap cache use at L2e and L3e

* Update tilesearch and a few params
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
-version = "0.3.9"
+version = "0.3.10"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/benchmark/tilesearch.jl b/benchmark/tilesearch.jl
@@ -5,12 +5,14 @@ function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R
   M, N = size(C); K = size(B,1)
   zc, za, zb = Octavian.zstridedpointer.((C,A,B))
   nspawn = VectorizationBase.num_cores()
+  threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing)
   t = Inf
   GC.@preserve C A B begin
     for _ ∈ 1:2
-      t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, Int(nspawn), F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
+      t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, threads, F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
     end
   end
+  Octavian.PolyesterWeave.free_threads!(torelease)
   return t
 end
 
@@ -119,15 +121,16 @@ using Optim
 hours = 60.0*60.0; days = 24hours;
 init = Float64[Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()]
 lower = 0.75 .* init;
-upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
+# previous bounds: upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
+upper = [0.9, 1.25init[2], 0.999, 0.999]; # same element order as `init`: W₁, W₂, R₁, R₂
 # init = [0.001, 0.9754033943603924, 0.5711159869399494, 0.7547361860432168];
 
+#= the ParticleSwarm search below is commented out
 opt = Optim.optimize(
     matmul_objective, init, ParticleSwarm(lower = lower, upper = upper),
     Optim.Options(iterations = 10^6, time_limit = 8hours)
 );
-
-
+=#
 
 
 
diff --git a/src/block_sizes.jl b/src/block_sizes.jl
@@ -178,25 +178,47 @@ independently of `M`, this algorithm guarantees all threads are on the same page
 end
 # Takes Nc, calcs Mc and Kc
 @inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
-    W = pick_vector_width(T)
-    α = _α * W
-    β = _β * W
-    L₁ₑ =  first_cache_size(Val(T)) * R₂
-    L₂ₑ = second_cache_size(Val(T)) * R₃
+  W = pick_vector_width(T)
+  α = _α * W
+  β = _β * W # NOTE(review): β is not read again in this function
+  L₁ₑ =  first_cache_size(Val(T)) * R₂ # first cache size scaled by R₂
+  L₂ₑ = second_cache_size(Val(T)) * R₃ # second cache size scaled by R₃
 
-    Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
-    Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
-    Kblock, Krem = divrem_fast(K, Kiter)
-    Kblock_Krem = Kblock + One()
+  Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ)) # reciprocal of the Kc guess; the larger value gives more K iterations
+  Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
+  Kblock, Krem = divrem_fast(K, Kiter)
+  Kblock_Krem = Kblock + One() # one longer than Kblock (presumably the size of the Krem leftover blocks — confirm)
 
-    Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ
-    Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor)
-    Mblock_Mrem = Mbsize + W * Wfactor
-    
-    promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter)
+  Mᵣ = Wfactor * W # M block sizes below are built from multiples of Mᵣ
+  Mc_init = floor(Int, Base.FastMath.div_fast(L₁ₑ / Mᵣ, Float64(Kblock_Krem))) # floor(L₁ₑ / (Mᵣ * Kblock_Krem))
+  Mc_init_base = max(0, Mc_init - 1) # when nonzero, Mbsize + Mᵣ below equals Mc_init * Mᵣ
+  Kblock_summary = promote(Kblock, Kblock_Krem, Krem, Kiter)
+  if (Mc_init_base ≠ 0) # Mc_init > 1
+    Mbsize = Mc_init_base * Mᵣ
+    Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ) # whole multiples of Mᵣ in M plus leftover (assuming divrem_fast acts like divrem)
+    Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
+    if Miter == 0 # M holds fewer than Mc_init_base multiples of Mᵣ
+       return (0, 0, Int(M)::Int, 0, 1), Kblock_summary # a single iteration; all of M goes into Mremfinal
+    elseif Miter > Mrem # each of the Mrem leftover multiples of Mᵣ can go to its own enlarged block
+      Mblock_Mrem = Mbsize + Mᵣ
+      Mremfinal = Mbsize + Mblocks_rem
+      # check (was a debug @show): Mbsize * (Miter - 1 - Mrem) + Mrem * Mblock_Mrem + Mremfinal == M
+      map(Int, (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)), Kblock_summary
+    else
+      _Mbsize, _Mrem, _Mremfinal, _Miter = split_m(M, Miter + (Mrem ≠ 0), Mᵣ) # Miter ≤ Mrem: defer to split_m, count bumped by one when Mrem ≠ 0
+      _Mblock_Mrem = _Mbsize + Mᵣ
+      return map(Int, (_Mbsize, _Mblock_Mrem, _Mremfinal, _Mrem, _Miter)), Kblock_summary
+    end
+  else # Mc_init ≤ 1
+    Mbsize0 = Int(Mᵣ)
+    Mblock_Mrem0 = Int(Mᵣ)
+    Miter0, Mremfinal0 = divrem_fast(M, Mᵣ)
+    map(Int, (Mbsize0, Mblock_Mrem0, Mremfinal0, 0, Miter0)), Kblock_summary # NOTE(review): if the sum checked in the `Miter > Mrem` branch applies here, this totals M - Mᵣ (and Miter0 == 0 when M < Mᵣ) — confirm
+  end
 end
 
 @inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil`
+# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`
 
 """
   find_first_acceptable(M, W)
diff --git a/src/global_constants.jl b/src/global_constants.jl
@@ -16,11 +16,10 @@ MᵣW_mul_factor(::True) = StaticInt{4}()
 MᵣW_mul_factor(::False) = StaticInt{9}()
 MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))
 
-W₁Default(::True) = StaticFloat64{0.009898277594117685}()
-# W₁Default(::True) = StaticFloat64{0.0009898277594117685}()
-W₂Default(::True) = StaticFloat64{0.9865020832559304}()
-R₁Default(::True) = StaticFloat64{0.5820044063603483}()
-R₂Default(::True) = StaticFloat64{0.7580885846640107}()
+W₁Default(::True) = StaticFloat64{0.0007423708195588264}() # ≈ 0.75 × 0.0009898277594117685; NOTE(review): matches tilesearch's `lower = 0.75 .* init` bound, so the optimum may lie lower — confirm
+W₂Default(::True) = StaticFloat64{0.7398765624419478}() # ≈ 0.75 × the previous W₂ value (same lower-bound pattern)
+R₁Default(::True) = StaticFloat64{0.4697043382682602}()
+R₂Default(::True) = StaticFloat64{0.6342912896800855}()
 
 W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
 W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl
@@ -4,7 +4,6 @@ m_values  = [10, 20, 50, 100, 150, 200]
 
 
 
-include("_matmul.jl")
 typ = get(ENV, "JULIA_TEST_ELTYPE", "ALL")
 types = if typ == "Float64"
   DataType[Float64]
diff --git a/test/matmul_main.jl b/test/matmul_main.jl
@@ -5,8 +5,8 @@ m_values  = [200, 300, 400]
 testset_name_suffix = "(main)"
 
 for T ∈ (Float64,Float32,Int64,Int32)
-  @time test_complex(T, m_values, k_values, n_values, testset_name_suffix)
   @time test_real(T, m_values, k_values, n_values, testset_name_suffix)
+  @time test_complex(T, m_values, k_values, n_values, testset_name_suffix)
 end
 
 A = rand(2,2); B = rand(2,2); AB = A*B; C = fill(NaN, 2, 2);
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -23,12 +23,10 @@ include("block_sizes.jl")
 include("init.jl")
 include("integer_division.jl")
 include("macrokernels.jl")
+include("_matmul.jl") # included once here rather than from matmul_coverage.jl (presumably defines test_real/test_complex — confirm)
+coverage || include("matmul_main.jl") # the main matmul tests are skipped when `coverage` is true
 include("matmul_coverage.jl")
 include("utils.jl")
 include("forward_diff.jl")
 
-if !coverage
-    include("matmul_main.jl")
-end
-
 include("aqua.jl") # run the Aqua.jl tests last