using GLMakie

# Generate a toy 2D "flower" dataset: `n` points spread over `npetals`
# petals, plus one-hot labels encoding the petal each point belongs to.
function flower(n; npetals = 8)
    n = div(n, npetals)                 # points per petal
    x = mapreduce(hcat, (1:npetals) .* (2π/npetals)) do θ
        ct = cos(θ)
        st = sin(θ)

        # a petal-shaped blob along the positive x-axis ...
        x0 = tanh.(randn(1, n) .- 1) .+ 4.0 .+ 0.05 .* randn(1, n)
        y0 = randn(1, n) .* 0.3

        # ... rotated by θ around the origin
        x₁ = x0 .* ct .- y0 .* st
        x₂ = x0 .* st .+ y0 .* ct
        vcat(x₁, x₂)
    end
    # one-hot encode the petal index of every point
    _y = mapreduce(i -> fill(i, n), vcat, 1:npetals)
    y = zeros(npetals, length(_y))
    foreach(i -> y[_y[i], i] = 1, 1:length(_y))
    Float32.(x), Float32.(y)
end

x, y = flower(900)
scatter(x[1,:], x[2,:], color = mapslices(argmax, y, dims = 1)[:])
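
# Sanity check of the shapes: 900 points are rounded down to 112 per petal,
# so the default 8 petals yield 8 * 112 = 896 samples with one-hot labels.
@assert size(x) == (2, 896) && size(y) == (8, 896)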

#######
# Define a TrackedArray for operator-overloading AD
#######
struct TrackedArray{T,N,V<:AbstractArray{T,N}} <: AbstractArray{T,N}
    value::V                # the payload
    grad::Union{Nothing,V}  # accumulated gradient, same shape as `value`
    tape::Vector{Any}       # (downstream node, pullback) pairs recorded on the forward pass
end

TrackedArray(a::AbstractArray) = TrackedArray(a, similar(a) .= 0, [])
TrackedMatrix{T,V} = TrackedArray{T,2,V} where {T,V<:AbstractMatrix{T}}
TrackedVector{T,V} = TrackedArray{T,1,V} where {T,V<:AbstractVector{T}}
Base.size(a::TrackedArray) = size(a.value)
Base.show(io::IO, ::MIME"text/plain", a::TrackedArray) = show(io, a)
Base.show(io::IO, a::TrackedArray) = print(io, "TrackedArray($(size(a.value)))")
value(A::TrackedArray) = A.value
resetgrad!(A::TrackedArray) = (A.grad .= 0; empty!(A.tape))
value(A) = A
track(A) = TrackedArray(A)
track(a::Number) = TrackedArray(reshape([a], 1, 1))

# Pull gradients back through the tape: recursively accumulate the gradient
# of every downstream node, apply the recorded pullbacks, and sum the results.
function accum!(A::TrackedArray)
    isempty(A.tape) && return(A.grad)
    A.grad .= sum(g(accum!(r)) for (r, g) in A.tape)
    empty!(A.tape)
    A.grad
end
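
# A minimal walk-through of the tape (the pullback here is pushed by hand for
# illustration; the real entries come from the rules defined below): we
# register C = 2A with pullback Δ -> 2Δ, seed C.grad, and let accum!
# propagate the gradient back to A.
let A = track(ones(2, 2))
    C = track(2 .* value(A))         # pretend C = 2A was computed from A
    push!(A.tape, (C, Δ -> 2 .* Δ))  # pullback of C = 2A
    fill!(C.grad, 1)                 # seed ∂loss/∂C
    accum!(A)                        # replays the tape backwards
    @assert all(A.grad .== 2)
end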

#######
# Define AD rules for the few operations appearing in the FFNN
#######
import Base: +, *
import Base.Broadcast: broadcasted
# C = A * B; pullbacks: ∂A = Δ * B', ∂B = A' * Δ
function *(A::TrackedMatrix, B::TrackedMatrix)
    a, b = value.((A, B))
    C = track(a * b)
    push!(A.tape, (C, Δ -> Δ * b'))
    push!(B.tape, (C, Δ -> a' * Δ))
    C
end

function *(A::TrackedMatrix, B::AbstractMatrix)
    a, b = value.((A, B))
    C = track(a * b)
    push!(A.tape, (C, Δ -> Δ * b'))
    C
end

# broadcasted bias addition: the gradient of B sums over the broadcast dimension
function broadcasted(::typeof(+), A::TrackedMatrix, B::TrackedVector)
    C = track(value(A) .+ value(B))
    push!(A.tape, (C, Δ -> Δ))
    push!(B.tape, (C, Δ -> sum(Δ, dims = 2)[:]))
    C
end

# numerically stable sigmoid with clamped tails
function σ(x::Real)
    t = @fastmath exp(-abs(x))
    y = ifelse(x ≥ 0, inv(1 + t), t / (1 + t))
    ifelse(x > 40, one(y), ifelse(x < -80, zero(y), y))
end
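
# Spot check: the two branches are algebraically equal
# (1/(1 + e⁻ˣ) = eˣ/(1 + eˣ)) and the tails are clamped.
@assert σ(0.0) == 0.5
@assert σ(100.0) == 1.0 && σ(-100.0) == 0.0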

broadcasted(::typeof(identity), A::TrackedArray) = A

# σ'(x) = σ(x)(1 - σ(x)), so the pullback reuses the forward output Ω
function broadcasted(::typeof(σ), A::TrackedArray)
    Ω = σ.(value(A))
    C = track(Ω)
    push!(A.tape, (C, Δ -> Δ .* Ω .* (1 .- Ω)))
    C
end

# mean squared error over columns; pullback: ∂A = (A - B) / n
function mse(A::TrackedMatrix, B::AbstractMatrix)
    n = size(A, 2)
    a = value(A)
    c = similar(a, 1, 1)
    c .= sum((a .- B).^2)/2n
    C = track(c)
    push!(A.tape, (C, Δ -> Δ .* (a .- B) ./ n))
    C
end

mse(x::AbstractMatrix, y::AbstractMatrix) = sum((x - y).^2) / (2*size(x,2))
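
# A hedged spot check (made-up data): compare the recorded mse pullback
# against a one-sided finite difference on a single entry of A.
let A = track(randn(3, 4)), B = randn(3, 4)
    L = mse(A, B)
    fill!(L.grad, 1)
    g = copy(accum!(A))
    h = 1e-4
    A.value[1, 1] += h
    numeric = (value(mse(A, B))[1] - value(L)[1]) / h
    @assert isapprox(g[1, 1], numeric; atol = 1e-2)
end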

#######
# Define a Dense layer
#######
struct Dense{F,W,B}
    σ::F    # activation
    w::W    # weight matrix, (out × in)
    b::B    # bias vector, (out,)
end

Base.show(io::IO, m::Dense) = print(io, "Dense($(size(m.w,2)) → $(size(m.w,1)))")
Dense(i::Int, o::Int, σ = identity) = Dense(σ, randn(Float32, o, i), randn(Float32, o))
track(m::Dense) = Dense(m.σ, track(m.w), track(m.b))
track(m::ComposedFunction) = track(m.outer) ∘ track(m.inner)
(m::Dense)(x) = m.σ.(m.w * x .+ m.b)
params(m::ComposedFunction) = vcat(params(m.outer), params(m.inner))
params(m::Dense) = [m.w, m.b]
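
# A small usage sketch (hypothetical sizes): composing two tracked layers
# with ∘ applies the right-most layer first; `params` then walks the
# ComposedFunction and collects all four tracked parameter arrays.
let mdemo = track(Dense(5, 3)) ∘ track(Dense(2, 5, σ))
    @assert length(params(mdemo)) == 4
    @assert size(value(mdemo(randn(Float32, 2, 7)))) == (3, 7)
end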

#######
# Let's try to actually train a model
#######
x, y = flower(900)
function initmodel()
    m₁ = track(Dense(2, 20, σ))
    m₂ = track(Dense(20, 20, σ))
    m₃ = track(Dense(20, size(y,1)))
    m₃ ∘ m₂ ∘ m₁
end
m = initmodel()
m(x) |> value

######
# Let's try to learn the parameters
######
α = 0.01                                        # learning rate
ps = params(m)
@elapsed for i in 1:10000
    foreach(resetgrad!, ps)                     # zero grads, clear tapes
    loss = mse(m(x), y)                         # forward pass records the tape
    fill!(loss.grad, 1)                         # seed ∂loss/∂loss = 1
    foreach(accum!, ps)                         # backward pass
    foreach(x -> x.value .-= α .* x.grad, ps)   # SGD step
    mod(i,250) == 0 && println("loss after $(i) iterations = ", sum(value(loss)))
end

all(mapslices(argmax, value(m(x)), dims = 1)[:] .== mapslices(argmax, y, dims = 1)[:])
scatter(x[1,:], x[2,:], color = mapslices(argmax, value(m(x)), dims = 1)[:])

######
# Let's try to move the computation to GPU
######
using CUDA
# the AD rules above are written against AbstractArray, so moving the data
# to CuArrays is all that is needed
gpu(x::AbstractArray) = CuArray(x)
gpu(x::TrackedArray) = TrackedArray(CuArray(value(x)))
gpu(m::Dense) = Dense(m.σ, gpu(m.w), gpu(m.b))
gpu(m::ComposedFunction) = gpu(m.outer) ∘ gpu(m.inner)

gx, gy = gpu(x), gpu(y)
m = gpu(m)
ps = params(m)
@elapsed for i in 1:10000
    foreach(resetgrad!, ps)
    loss = mse(m(gx), gy)
    fill!(loss.grad, 1)
    foreach(accum!, ps)
    foreach(x -> x.value .-= α .* x.grad, ps)
    mod(i,250) == 0 && println("loss after $(i) iterations = ", sum(value(loss)))
end

#######
# Why do we see only a small speed-up? The problem is small.
#######
using BenchmarkTools
p = randn(Float32, 20, 2)
@benchmark $(p) * $(x)
gp = gpu(p)
@benchmark CUDA.@sync $(gp) * $(gx)   # sync, since CUDA kernels launch asynchronously
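
# On this tiny product (20×2 times 2×896) kernel-launch overhead dominates.
# A hedged sketch with an arbitrary larger size where the GPU should win:
P = randn(Float32, 2048, 2048)
@benchmark $(P) * $(P)
gP = gpu(P)
@benchmark CUDA.@sync $(gP) * $(gP)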

######
# Let's verify the gradients
######
using FiniteDifferences
m = initmodel()     # fresh CPU model (the previous one now lives on the GPU)
ps = params(m)
errs = map(ps) do p
    foreach(resetgrad!, ps)
    loss = mse(m(x), y)
    fill!(loss.grad, 1)
    foreach(accum!, ps)
    θ = deepcopy(value(p))
    Δθ = deepcopy(p.grad)
    f = θ_ -> begin
        p.value .= θ_
        sum(value(mse(m(x), y)))    # scalar output for FiniteDifferences.grad
    end
    err = sum(abs2.(grad(central_fdm(5, 1), f, θ)[1] - Δθ))
    p.value .= θ                    # restore the unperturbed parameter
    err
end
# every entry of `errs` should be ≈ 0 up to finite-difference noise