@@ -178,25 +178,47 @@ independently of `M`, this algorithm guarantees all threads are on the same page
178178end
179179# Takes Nc, calcs Mc and Kc
# NOTE(review): this span is a diff hunk, not plain source. Lines with `-` after the line
# number are the implementation being removed; lines with `+` are its replacement.
#
# solve_McKc takes the problem sizes `M` and `K` plus an already chosen `Nc`, and returns two
# tuples:
#   1. (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)  -- how `M` is split into blocks
#   2. (Kblock, Kblock_Krem, Krem, Kiter)             -- how `K` is split into blocks
# `T` is only used to query `pick_vector_width`, `first_cache_size` and `second_cache_size`.
# `R₂` scales the first cache size and `R₃` the second cache size; `_α` and `_β` are scaled
# by the vector width `W`.
180180@inline function solve_McKc (:: Val{T} , M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
181- W = pick_vector_width (T)
182- α = _α * W
183- β = _β * W
184- L₁ₑ = first_cache_size (Val (T)) * R₂
185- L₂ₑ = second_cache_size (Val (T)) * R₃
# NOTE(review): `β` is assigned but never read in the lines visible here -- confirm whether it
# is still needed.
181+ W = pick_vector_width (T)
182+ α = _α * W
183+ β = _β * W
184+ L₁ₑ = first_cache_size (Val (T)) * R₂
185+ L₂ₑ = second_cache_size (Val (T)) * R₃
186186
187- Kc_init⁻¹ = Base. FastMath. max_fast (√ (α/ L₁ₑ), Nc* inv (L₂ₑ))
188- Kiter = cldapproxi (K, Kc_init⁻¹) # approximate `ceil`
189- Kblock, Krem = divrem_fast (K, Kiter)
190- Kblock_Krem = Kblock + One ()
# K blocking (unchanged by the patch): `Kc_init⁻¹` is the larger of √(α / L₁ₑ) and Nc / L₂ₑ,
# i.e. the reciprocal of an initial block length. `Kiter` is approximately ceil(K * Kc_init⁻¹).
# `Kblock_Krem` is `Kblock + 1`; presumably `Krem` of the `Kiter` blocks use that longer
# length to absorb the remainder -- confirm against the caller.
187+ Kc_init⁻¹ = Base. FastMath. max_fast (√ (α/ L₁ₑ), Nc* inv (L₂ₑ))
188+ Kiter = cldapproxi (K, Kc_init⁻¹) # approximate `ceil`
189+ Kblock, Krem = divrem_fast (K, Kiter)
190+ Kblock_Krem = Kblock + One ()
191191
# Removed M blocking: derived the number of M iterations directly and let `split_m` do the
# whole split.
192- Miter_init = cldapproxi (M * inv (L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ
193- Mbsize, Mrem, Mremfinal, Miter = split_m (M, Miter_init, W * Wfactor)
194- Mblock_Mrem = Mbsize + W * Wfactor
195-
196- promote (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote (Kblock, Kblock_Krem, Krem, Kiter)
# Added M blocking: `Mᵣ` is the unit every M block size below is built from.
# `Mc_init` = floor(L₁ₑ / Mᵣ / Kblock_Krem); `Mc_init_base` is one less, clamped at zero.
192+ Mᵣ = Wfactor * W
193+ Mc_init = floor (Int, Base. FastMath. div_fast (L₁ₑ / Mᵣ, Float64 (Kblock_Krem)))
194+ Mc_init_base = max (0 , Mc_init - 1 )
195+ Kblock_summary = promote (Kblock, Kblock_Krem, Krem, Kiter)
196+ if (Mc_init_base ≠ 0 ) # Mc_init > 1
# Regular block is `Mc_init_base` units of `Mᵣ`. `Mblocks` counts whole `Mᵣ` units in `M`
# and `Mblocks_rem` is what is left over (assuming `divrem_fast` behaves like `divrem`).
197+ Mbsize = Mc_init_base * Mᵣ
198+ Mblocks, Mblocks_rem = divrem_fast (M, Mᵣ)
199+ Miter, Mrem = divrem_fast (Mblocks, Mc_init_base)
200+ if Miter == 0
# Not even one regular block: report a single iteration whose final size is all of `M`.
201+ return (0 , 0 , Int (M):: Int , 0 , 1 ), Kblock_summary
202+ elseif Miter > Mrem
# `Mrem` of the blocks are one `Mᵣ` longer; the final block also takes `Mblocks_rem`.
# With these values the commented-out `@show` expression below evaluates to `M`
# (again assuming `divrem_fast` behaves like `divrem`).
203+ Mblock_Mrem = Mbsize + Mᵣ
204+ Mremfinal = Mbsize + Mblocks_rem
205+ # @show Mbsize * (Miter - 1 - Mrem) + Mrem * Mblock_Mrem + Mremfinal
206+ map (Int, (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)), Kblock_summary
207+ else
# Leftover units are at least as many as the iterations: add one iteration when there
# is any leftover and let `split_m` redo the split.
208+ _Mbsize, _Mrem, _Mremfinal, _Miter = split_m (M, Miter + (Mrem ≠ 0 ), Mᵣ)
209+ _Mblock_Mrem = _Mbsize + Mᵣ
210+ return map (Int, (_Mbsize, _Mblock_Mrem, _Mremfinal, _Mrem, _Miter)), Kblock_summary
211+ end
212+ else
# `Mc_init <= 1`: every block is exactly one `Mᵣ`, with no longer blocks (`Mrem` is 0).
# NOTE(review): here the `@show` expression from the branch above would give `M - Mᵣ`, not
# `M`, because `Miter0` does not count the final `Mremfinal0` block -- confirm the consumer
# expects that.
213+ Mbsize0 = Int (Mᵣ)
214+ Mblock_Mrem0 = Int (Mᵣ)
215+ Miter0, Mremfinal0 = divrem_fast (M, Mᵣ)
216+ map (Int, (Mbsize0, Mblock_Mrem0, Mremfinal0, 0 , Miter0)), Kblock_summary
217+ end
197218end
198219
# cldapproxi(n, d⁻¹): approximate ceiling division that takes the reciprocal of the divisor.
# It multiplies `n` by `d⁻¹` with fast-math, adds 0.9999999999999432 (a constant just below 1)
# and converts the sum to `Int` with `Base.fptosi`, so no division or `ceil` call is issued.
# NOTE(review): assumes `fptosi` truncates toward zero, so the result is only a ceiling for
# non-negative products -- confirm callers never pass a negative `n * d⁻¹`.
199220@inline cldapproxi (n, d⁻¹) = Base. fptosi (Int, Base. FastMath. add_fast (Base. FastMath. mul_fast (n, d⁻¹), 0.9999999999999432 )) # approximate `ceil`
221+ # @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`
200222
201223"""
202224 find_first_acceptable(M, W)
0 commit comments