Skip to content

Commit ad7f099

Browse files
authored
don't multithread x86 (#146)
* don't multithread x86 * matmul for x86 * prefer chained && over all * forwardiff updates * don't test forward_diff on 32 bit Windows
1 parent db713f3 commit ad7f099

File tree

6 files changed

+150
-52
lines changed

6 files changed

+150
-52
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Octavian"
22
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
33
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
4-
version = "0.3.13"
4+
version = "0.3.14"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -16,7 +16,7 @@ ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
1616
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1717

1818
[compat]
19-
ArrayInterface = "3.1.14, 5.0.1"
19+
ArrayInterface = "3.1.14, 5.0.1, 6"
2020
CPUSummary = "0.1.1 - 0.1.8, 0.1.14"
2121
IfElse = "0.1"
2222
LoopVectorization = "0.12.86"

src/forward_diff.jl

Lines changed: 112 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,59 @@
11

22
real_rep(a::AbstractArray{DualT}) where {TAG, T, DualT<:ForwardDiff.Dual{TAG, T}} = reinterpret(reshape, T, a)
3+
_view1(B::AbstractMatrix) = @view(B[1,:])
4+
_view1(B::AbstractArray{<:Any,3}) = @view(B[1,:,:])
5+
36

47
# multiplication of dual vector/matrix by standard matrix from the left
58
function _matmul!(_C::AbstractVecOrMat{DualT}, A::AbstractMatrix, _B::AbstractVecOrMat{DualT},
6-
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {DualT <: ForwardDiff.Dual}
7-
B = real_rep(_B)
8-
C = real_rep(_C)
9+
α, β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {DualT <: ForwardDiff.Dual}
10+
B = real_rep(_B)
11+
C = real_rep(_C)
912

10-
@tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1)), l in indices((C, B), 1)
11-
Cₗₘₙ = zero(eltype(C))
12-
for k ∈ indices((A, B), 2)
13-
Cₗₘₙ += A[m, k] * B[l, k, n]
14-
end
15-
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
13+
@tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1)), l in indices((C, B), 1)
14+
Cₗₘₙ = zero(eltype(C))
15+
for k ∈ indices((A, B), 2)
16+
Cₗₘₙ += A[m, k] * B[l, k, n]
1617
end
18+
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
19+
end
1720

18-
_C
21+
_C
1922
end
2023

2124
# multiplication of dual matrix by standard vector/matrix from the right
2225
@inline function _matmul!(_C::AbstractVecOrMat{DualT}, _A::AbstractMatrix{DualT}, B::AbstractVecOrMat,
2326
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing) where {TAG, T, DualT <: ForwardDiff.Dual{TAG, T}}
24-
if all((ArrayInterface.is_dense(_C), ArrayInterface.is_column_major(_C),
25-
ArrayInterface.is_dense(_A), ArrayInterface.is_column_major(_A)))
26-
# we can avoid the reshape and call the standard method
27-
A = reinterpret(T, _A)
28-
C = reinterpret(T, _C)
29-
_matmul!(C, A, B, α, β, nthread, nothing)
30-
else
31-
# we cannot use the standard method directly
32-
A = real_rep(_A)
33-
C = real_rep(_C)
34-
35-
@tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2), l in indices((C, A), 1)
36-
Cₗₘₙ = zero(eltype(C))
37-
for k ∈ indices((A, B), (3, 1))
38-
Cₗₘₙ += A[l, m, k] * B[k, n]
39-
end
40-
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
41-
end
27+
if Bool(ArrayInterface.is_dense(_C)) && Bool(ArrayInterface.is_column_major(_C)) &&
28+
Bool(ArrayInterface.is_dense(_A)) && Bool(ArrayInterface.is_column_major(_A))
29+
# we can avoid the reshape and call the standard method
30+
A = reinterpret(T, _A)
31+
C = reinterpret(T, _C)
32+
_matmul!(C, A, B, α, β, nthread, nothing)
33+
else
34+
# we cannot use the standard method directly
35+
A = real_rep(_A)
36+
C = real_rep(_C)
37+
38+
@tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2), l in indices((C, A), 1)
39+
Cₗₘₙ = zero(eltype(C))
40+
for k ∈ indices((A, B), (3, 1))
41+
Cₗₘₙ += A[l, m, k] * B[k, n]
42+
end
43+
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
4244
end
45+
end
4346

44-
_C
47+
_C
4548
end
4649

47-
_view1(B::AbstractMatrix) = @view(B[1,:])
48-
_view1(B::AbstractArray{<:Any,3}) = @view(B[1,:,:])
4950
@inline function _matmul!(_C::AbstractVecOrMat{DualT}, _A::AbstractMatrix{DualT}, _B::AbstractVecOrMat{DualT},
50-
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing) where {TAG, T, P, DualT <: ForwardDiff.Dual{TAG, T, P}}
51+
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig=nothing) where {TAG, T, P, DualT <: ForwardDiff.Dual{TAG, T, P}}
5152
A = real_rep(_A)
5253
C = real_rep(_C)
5354
B = real_rep(_B)
54-
if all((ArrayInterface.is_dense(_C), ArrayInterface.is_column_major(_C),
55-
ArrayInterface.is_dense(_A), ArrayInterface.is_column_major(_A)))
55+
if Bool(ArrayInterface.is_dense(_C)) && Bool(ArrayInterface.is_column_major(_C)) &&
56+
Bool(ArrayInterface.is_dense(_A)) && Bool(ArrayInterface.is_column_major(_A))
5657
# we can avoid the reshape and call the standard method
5758
Ar = reinterpret(T, _A)
5859
Cr = reinterpret(T, _C)
@@ -77,3 +78,80 @@ _view1(B::AbstractArray{<:Any,3}) = @view(B[1,:,:])
7778
end
7879
_C
7980
end
81+
82+
83+
# multiplication of dual vector/matrix by standard matrix from the left
84+
function _matmul_serial!(_C::AbstractVecOrMat{DualT}, A::AbstractMatrix, _B::AbstractVecOrMat{DualT},
85+
α, β, MKN) where {DualT <: ForwardDiff.Dual}
86+
B = real_rep(_B)
87+
C = real_rep(_C)
88+
89+
@turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1)), l in indices((C, B), 1)
90+
Cₗₘₙ = zero(eltype(C))
91+
for k ∈ indices((A, B), 2)
92+
Cₗₘₙ += A[m, k] * B[l, k, n]
93+
end
94+
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
95+
end
96+
97+
_C
98+
end
99+
100+
# multiplication of dual matrix by standard vector/matrix from the right
101+
@inline function _matmul_serial!(_C::AbstractVecOrMat{DualT}, _A::AbstractMatrix{DualT}, B::AbstractVecOrMat,
102+
α, β, MKN) where {TAG, T, DualT <: ForwardDiff.Dual{TAG, T}}
103+
if Bool(ArrayInterface.is_dense(_C)) && Bool(ArrayInterface.is_column_major(_C)) &&
104+
Bool(ArrayInterface.is_dense(_A)) && Bool(ArrayInterface.is_column_major(_A))
105+
# we can avoid the reshape and call the standard method
106+
A = reinterpret(T, _A)
107+
C = reinterpret(T, _C)
108+
_matmul_serial!(C, A, B, α, β, nothing)
109+
else
110+
# we cannot use the standard method directly
111+
A = real_rep(_A)
112+
C = real_rep(_C)
113+
114+
@turbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2), l in indices((C, A), 1)
115+
Cₗₘₙ = zero(eltype(C))
116+
for k ∈ indices((A, B), (3, 1))
117+
Cₗₘₙ += A[l, m, k] * B[k, n]
118+
end
119+
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
120+
end
121+
end
122+
123+
_C
124+
end
125+
126+
@inline function _matmul_serial!(_C::AbstractVecOrMat{DualT}, _A::AbstractMatrix{DualT}, _B::AbstractVecOrMat{DualT},
127+
α, β, MKN) where {TAG, T, P, DualT <: ForwardDiff.Dual{TAG, T, P}}
128+
A = real_rep(_A)
129+
C = real_rep(_C)
130+
B = real_rep(_B)
131+
if Bool(ArrayInterface.is_dense(_C)) && Bool(ArrayInterface.is_column_major(_C)) &&
132+
Bool(ArrayInterface.is_dense(_A)) && Bool(ArrayInterface.is_column_major(_A))
133+
# we can avoid the reshape and call the standard method
134+
Ar = reinterpret(T, _A)
135+
Cr = reinterpret(T, _C)
136+
_matmul_serial!(Cr, Ar, _view1(B), α, β, nothing)
137+
else
138+
# we cannot use the standard method directly
139+
@turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2), l in indices((C, A), 1)
140+
Cₗₘₙ = zero(eltype(C))
141+
for k ∈ indices((A, B), (3, 2))
142+
Cₗₘₙ += A[l, m, k] * B[1, k, n]
143+
end
144+
C[l, m, n] = α * Cₗₘₙ + β * C[l, m, n]
145+
end
146+
end
147+
Pstatic = static(P)
148+
@turbo for n ∈ indices((B,C),3), m ∈ indices((A,C),2), p ∈ 1:Pstatic
149+
Cₚₘₙ = zero(eltype(C))
150+
for k ∈ indices((A,B),(3,2))
151+
Cₚₘₙ += A[1,m,k] * B[p+1,k,n]
152+
end
153+
C[p+1,m,n] = C[p+1,m,n] + α*Cₚₘₙ
154+
end
155+
_C
156+
end
157+

src/matmul.jl

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,16 @@ end
115115
matmul_serial!(C, A, B, One(), Zero(), (M,K,N), ArrayInterface.contiguous_axis(C))
116116
return C
117117
end
118+
@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat, α)
119+
C, (M,K,N) = alloc_matmul_product(A, B)
120+
matmul_serial!(C, A, B, α, Zero(), (M,K,N), ArrayInterface.contiguous_axis(C))
121+
return C
122+
end
123+
@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat, α, β)
124+
C, (M,K,N) = alloc_matmul_product(A, B)
125+
matmul_serial!(C, A, B, α, β, (M,K,N), ArrayInterface.contiguous_axis(C))
126+
return C
127+
end
118128

119129

120130
# These methods must be compile time constant
@@ -165,7 +175,7 @@ Otherwise, based on the array's size, whether they are transposed, and whether t
165175
"""
166176
@inline function _matmul_serial!(
167177
C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN
168-
) where {T}
178+
) where {T <: Base.HWReal}
169179
((β ≢ Zero()) && iszero(β)) && return _matmul_serial!(C, A, B, α, Zero(), MKN)
170180
(β isa Bool) && return _matmul_serial!(C, A, B, α, One(), MKN)
171181
M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
@@ -216,7 +226,7 @@ function matmul_st_pack_dispatcher!(pC::AbstractStridedPointer{T}, pA, pB, α,
216226
nothing
217227
end
218228

219-
229+
if sizeof(Int) >= 8
220230
"""
221231
matmul(A, B)
222232
@@ -365,9 +375,7 @@ function __matmul!(
365375
else
366376
clamp(div_fast(M * N, StaticInt{256}() * W), 0, _nthread-1)
367377
end
368-
# nkern = cld_fast(M * N, MᵣW * Nᵣ)
369378
threads, torelease = PolyesterWeave.__request_threads(_nrequest % UInt32, PolyesterWeave.worker_pointer(), nothing)
370-
# _threads, _torelease = PolyesterWeave.request_threads(Threads.threadid()%UInt32, _nrequest)
371379

372380
nrequest = threads.i
373381
iszero(nrequest) && @goto SINGLETHREAD
@@ -401,9 +409,6 @@ end
401409

402410
# If tasks is [0,1,2,3] (e.g., `CloseOpen(0,4)`), it will wait on `MULTASKS[i]` for `i = [1,2,3]`.
403411
function waitonmultasks(threads, nthread)
404-
# for (_,tid) ∈ threads
405-
# wait(tid)
406-
# end
407412
(tnum, tuu) = PolyesterWeave.initial_state(threads)
408413
for _ ∈ CloseOpen(One(), nthread)
409414
(tnum, tuu) = PolyesterWeave.iter(tnum, tuu)
@@ -524,7 +529,11 @@ function sync_mul!(
524529
end
525530
nothing
526531
end
527-
532+
else
533+
@inline matmul(args::Vararg{Any,K}) where {K} = matmul_serial(args...)
534+
@inline matmul!(args::Vararg{Any,K}) where {K} = matmul_serial!(args...)
535+
end
536+
528537
function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, _, __) where {T}
529538
@tturbo for m ∈ indices((A,y),1)
530539
yₘ = zero(T)

test/_matmul.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ function matmul_pack_ab!(C, A, B)
7878
zc, za, zb = Octavian.zstridedpointer.((C,A,B))
7979
nspawn = min(Threads.nthreads(), Octavian.num_cores())
8080
GC.@preserve C A B begin
81-
if nspawn > 1
81+
if nspawn > 1 && sizeof(Int) >= 8
8282
threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing)
8383
@assert threads.i < Threads.nthreads()
8484
Octavian.matmul_pack_A_and_B!(

test/forward_diff.jl

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ randdual(x, ::Val{N}=Val(3)) where {N} = ForwardDiff.Dual(x, ntuple(_ -> randn()
3030
J2 = ForwardDiff.jacobian((C, B) -> LinearAlgebra.mul!(C, A2, B), C2, B2, config)
3131
@test J1 ≈ kron(I, A2)
3232
@test J1 ≈ J2
33+
34+
J3 = ForwardDiff.jacobian((C, B) -> Octavian.matmul_serial!(C, A1, B), C1, B1, config)
35+
@test J3 ≈ kron(I, A1)
36+
@test J3 ≈ J2
37+
3338
end
3439

3540
@testset "real array from the right" begin
@@ -38,7 +43,8 @@ randdual(x, ::Val{N}=Val(3)) where {N} = ForwardDiff.Dual(x, ntuple(_ -> randn()
3843

3944
J1 = ForwardDiff.jacobian((C, A) -> Octavian.matmul!(C, A, B1), C1, A1, config)
4045
J2 = ForwardDiff.jacobian((C, A) -> LinearAlgebra.mul!(C, A, B2), C2, A2, config)
41-
@test J1 J2
46+
J3 = ForwardDiff.jacobian((C, A) -> Octavian.matmul_serial!(C, A, B1), C1, A1, config)
47+
@test J1 ≈ J2 ≈ J3
4248

4349
# transposed arrays
4450
A1new = Matrix(A1')'
@@ -47,7 +53,8 @@ randdual(x, ::Val{N}=Val(3)) where {N} = ForwardDiff.Dual(x, ntuple(_ -> randn()
4753

4854
J1 = ForwardDiff.jacobian((C, A) -> Octavian.matmul!(C, A, B1), C1, A1new, config)
4955
J2 = ForwardDiff.jacobian((C, A) -> LinearAlgebra.mul!(C, A, B2), C2, A2new, config)
50-
@test J1 J2
56+
J3 = ForwardDiff.jacobian((C, A) -> Octavian.matmul_serial!(C, A, B1), C1, A1new, config)
57+
@test J1 ≈ J2 ≈ J3
5158

5259
# direct version using dual numbers
5360
A1dual = zeros(eltype(config), reverse(size(A1))...)
@@ -56,16 +63,18 @@ randdual(x, ::Val{N}=Val(3)) where {N} = ForwardDiff.Dual(x, ntuple(_ -> randn()
5663

5764
A2dual = deepcopy(A1dual)
5865
C2dual = deepcopy(C1dual)
59-
66+
C3dual = similar(C1dual); C4dual = similar(C2dual)
6067
Octavian.matmul!(C1dual, A1dual', B1)
6168
Octavian.matmul!(C2dual, A2dual', B2)
62-
@test C1dual ≈ C2dual
69+
Octavian.matmul_serial!(C3dual, A1dual', B1)
70+
Octavian.matmul_serial!(C4dual, A2dual', B2)
71+
@test C1dual ≈ C2dual ≈ C3dual ≈ C4dual
6372
end
6473

6574
@testset "two dual arrays" begin
6675
A1d = randdual.(A1)
6776
B1d = randdual.(B1)
68-
@test reinterpret(Float64, Octavian.matmul(A1d, B1d, 1.3)) ≈ reinterpret(Float64, (A1d * B1d) .* 1.3)
69-
@test reinterpret(Float64, Octavian.matmul(@view(A1d[begin:end-1,:]), B1d)) ≈ reinterpret(Float64, @view(A1d[begin:end-1,:]) * B1d)
77+
@test reinterpret(Float64, Octavian.matmul(A1d, B1d, 1.3)) ≈ reinterpret(Float64, Octavian.matmul_serial(A1d, B1d, 1.3)) ≈ reinterpret(Float64, (A1d * B1d) .* 1.3)
78+
@test reinterpret(Float64, Octavian.matmul(@view(A1d[begin:end-1,:]), B1d)) ≈ reinterpret(Float64, Octavian.matmul_serial(@view(A1d[begin:end-1,:]), B1d)) ≈ reinterpret(Float64, @view(A1d[begin:end-1,:]) * B1d)
7079
end
7180
end

test/runtests.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ include("_matmul.jl")
3434
coverage || include("matmul_main.jl")
3535
include("matmul_coverage.jl")
3636
include("utils.jl")
37-
include("forward_diff.jl")
37+
if sizeof(Int) >= 8 || !Sys.iswindows()
38+
include("forward_diff.jl")
39+
end
3840

3941
include("aqua.jl") # run the Aqua.jl tests last

0 commit comments

Comments
 (0)