|
| 1 | +################################################################################ |
| 2 | +# Distance matrix |
| 3 | +################################################################################ |
| 4 | +""" |
| 5 | + distancematrix(x [, y = x], metric = "euclidean") |
| 6 | +
|
| 7 | +Create a matrix with the distances between each pair of points of the |
| 8 | +time series `x` and `y` using `metric`. |
| 9 | +
|
| 10 | +The time series `x` and `y` can be `Dataset`s or vectors or matrices with data points |
| 11 | +in rows. |
| 12 | +The data point dimensions (or number of columns) must be the same for `x` and `y`. |
| 13 | +The returned value is a `n×m` matrix, with `n` being the length (or number of rows) |
| 14 | +of `x`, and `m` the length of `y`. |
| 15 | +
|
| 16 | +The metric can be identified by a string, or any of the `Metric`s defined in |
| 17 | +the [`Distances` package](https://github.com/JuliaStats/Distances.jl). |
| 18 | +The list of strings available to define the metric are: |
| 19 | +
|
| 20 | +* `"max"` or `"inf"` for the maximum or L∞ norm |
| 21 | + (`Chebyshev()` in the `Distances` package). |
| 22 | +* `"euclidean"` for the L2 or Euclidean norm, used by default |
| 23 | + (`Euclidean()` in `Distances`). |
| 24 | +* `"manhattan"`, `"cityblock"`, `"taxicab"` or `"min"` for the Manhattan or L1 norm |
| 25 | + (`Cityblock()` in `Distances`). |
| 26 | +""" |
| 27 | +distancematrix(x, metric::Union{Metric,String}=DEFAULT_METRIC, parallel = size(x)[1] > 500) = _distancematrix(x, getmetric(metric), Val(parallel)) |
| 28 | + |
| 29 | +# For 1-dimensional arrays (vectors), the distance does not depend on the metric |
| 30 | +distancematrix(x::Vector, y::Vector, metric=DEFAULT_METRIC, parallel = size(x)[1] > 500) = abs.(x .- y') |
| 31 | + |
| 32 | +# If the metric is supplied as a string, get the corresponding Metric from Distances |
| 33 | +distancematrix(x, y, metric::String, parallel = size(x)[1] > 500) = distancematrix(x, y, getmetric(metric), parallel) |
| 34 | + |
| 35 | +const MAXDIM = 9 |
| 36 | +function distancematrix(x::Tx, y::Ty, metric::Metric=DEFAULT_METRIC, parallel = size(x)[1] > 500) where |
| 37 | + {Tx<:Union{AbstractMatrix, Dataset}} where {Ty<:Union{AbstractMatrix, Dataset}} |
| 38 | + sx, sy = size(x), size(y) |
| 39 | + @assert sx[2] == sy[2] """ |
| 40 | + The dimensions of the data points in `x` and `y` must be equal! |
| 41 | + Found dim(x)=$(sx[2]), dim(y)=$(sy[2]). |
| 42 | + """ |
| 43 | + |
| 44 | + if sx[2] ≥ MAXDIM && typeof(metric) == Euclidean # Blas optimization |
| 45 | + return _distancematrix(Matrix(x), Matrix(y), metric, Val(parallel)) |
| 46 | + elseif Tx <: Matrix && Ty <: Matrix && metric == Chebyshev() |
| 47 | + return _distancematrix(x, y, metric, Val(parallel)) |
| 48 | + else |
| 49 | + return _distancematrix(Dataset(x), Dataset(y), metric, Val(parallel)) |
| 50 | + end |
| 51 | +end |
| 52 | + |
| 53 | +# Core function for Matrices (wrapper of `pairwise` from the Distances package) |
| 54 | +_distancematrix(x::AbstractMatrix, y::AbstractMatrix, metric::Metric, ::Val{false}) = pairwise(metric, x', y', dims=2) |
| 55 | + |
| 56 | +# First we define the serial versions of the functions. |
| 57 | +# Core function for Datasets |
| 58 | +function _distancematrix(x::Dataset{S,Tx}, y::Dataset{S,Ty}, |
| 59 | + metric::Metric, ::Val{false}) where {S, Tx, Ty} |
| 60 | + |
| 61 | + x = x.data |
| 62 | + y = y.data |
| 63 | + d = zeros(promote_type(Tx,Ty), length(x), length(y)) |
| 64 | + for j in 1:length(y) |
| 65 | + for i in 1:length(x) |
| 66 | + @inbounds d[i,j] = evaluate(metric, x[i], y[j]) |
| 67 | + end |
| 68 | + end |
| 69 | + return d |
| 70 | +end |
| 71 | + |
| 72 | +# Now, we define the parallel versions. |
| 73 | + |
| 74 | +function _distancematrix(x::Dataset{S,Tx}, y::Dataset{S,Ty}, |
| 75 | + metric::Metric, ::Val{true}) where {S, Tx, Ty} |
| 76 | + |
| 77 | + x = x.data |
| 78 | + y = y.data |
| 79 | + d = zeros(promote_type(Tx,Ty), length(x), length(y)) |
| 80 | + Threads.@threads for j in 1:length(y) |
| 81 | + for i in 1:length(x) |
| 82 | + @inbounds d[i,j] = evaluate(metric, x[i], y[j]) |
| 83 | + end |
| 84 | + end |
| 85 | + return d |
| 86 | +end |
| 87 | + |
| 88 | +function _distancematrix(x::Matrix{Tx}, y::Matrix{Ty}, |
| 89 | + metric::Metric, ::Val{true}) where {Tx, Ty} |
| 90 | + |
| 91 | + x = x.data |
| 92 | + y = y.data |
| 93 | + d = zeros(promote_type(Tx,Ty), length(x), length(y)) |
| 94 | + Threads.@threads for j in 1:length(y) |
| 95 | + for i in 1:length(x) |
| 96 | + @inbounds d[i,j] = evaluate(metric, x[i, :], y[j, :]) |
| 97 | + end |
| 98 | + end |
| 99 | + return d |
| 100 | +end |
| 101 | + |
| 102 | + |
| 103 | +# Now, we define methods for the single trajectory. |
| 104 | +# We can speed this up by only calculating the lower triangle, |
| 105 | +# which halves the number of computations needed. |
| 106 | + |
| 107 | +# Again, we'll define the serial version first: |
| 108 | +function _distancematrix(x::Vector{T}, metric::Metric, ::Val{false}) where T |
| 109 | + d = zeros(T, length(x), length(x)) |
| 110 | + |
| 111 | + for j in 1:length(x) |
| 112 | + for i in 1:j |
| 113 | + @inbounds d[i, j] = abs(x[i] - x[j]) |
| 114 | + end |
| 115 | + end |
| 116 | + |
| 117 | + return Symmetric(d, :U) |
| 118 | + |
| 119 | +end |
| 120 | + |
| 121 | +function _distancematrix(x::Dataset{S, T}, metric::Metric, ::Val{false}) where T where S |
| 122 | + d = zeros(T, length(x), length(x)) |
| 123 | + |
| 124 | + for j in 1:length(x) |
| 125 | + for i in 1:j |
| 126 | + @inbounds d[i, j] = evaluate(metric, x[i], x[j]) |
| 127 | + end |
| 128 | + end |
| 129 | + |
| 130 | + return Symmetric(d, :U) |
| 131 | + |
| 132 | +end |
| 133 | + |
| 134 | +# Now, we define the parallel version. There's a twist, though. |
| 135 | + |
| 136 | +# The single trajectory case is a little tricky. If you try it naively, |
| 137 | +# using the method we used for the serial computation above, the points are |
| 138 | +# partioned out unequally between threads. This means that performance actually |
| 139 | +# **decreases** compared to the full parallel version. To mitigate this, we need |
| 140 | +# to partition the computation equally among all threads. |
| 141 | +# The function I've written below does essentially this - given a length, |
| 142 | +# it calculates the appropriate partitioning, and returns a list of indices, |
| 143 | +# by utilizing the fact that the area is proportional to the square of the height. |
| 144 | +# It partitions the "triangle" which needs to be computed into "trapezoids", |
| 145 | +# which all have an equal area. |
| 146 | +function partition_indices(len) |
| 147 | + indices = Vector{UnitRange{Int}}(undef, Threads.nthreads()) |
| 148 | + length = len |
| 149 | + offset = 1 |
| 150 | + |
| 151 | + # Here, we have to iterate in reverse, to get an equal area every time |
| 152 | + for n in Threads.nthreads():-1:1 |
| 153 | + partition = round(Int, length / sqrt(n)) # length varies as square root of area |
| 154 | + indices[n] = offset:(partition .+ offset .- 1) |
| 155 | + length -= partition |
| 156 | + offset += partition |
| 157 | + end |
| 158 | + |
| 159 | + return indices |
| 160 | +end |
| 161 | + |
| 162 | +# And now for the actual implementation: |
| 163 | +function _distancematrix(x::Vector{T}, metric::Metric, ::Val{true}) where T |
| 164 | + d = zeros(T, length(x), length(x)) |
| 165 | + |
| 166 | + Threads.@threads for k in partition_indices(length(x)) |
| 167 | + for j in k |
| 168 | + for i in 1:j |
| 169 | + @inbounds d[i, j] = abs(x[i] - x[j]) |
| 170 | + end |
| 171 | + end |
| 172 | + end |
| 173 | + |
| 174 | + return Symmetric(d, :U) |
| 175 | + |
| 176 | +end |
| 177 | + |
| 178 | +function _distancematrix(x::Dataset{S, T}, metric::Metric, ::Val{true}) where T where S |
| 179 | + d = zeros(T, length(x), length(x)) |
| 180 | + |
| 181 | + Threads.@threads for k in partition_indices(length(x)) |
| 182 | + for j in k |
| 183 | + for i in 1:j |
| 184 | + @inbounds d[i, j] = evaluate(metric, x[i], x[j]) |
| 185 | + end |
| 186 | + end |
| 187 | + end |
| 188 | + |
| 189 | + return Symmetric(d, :U) |
| 190 | + |
| 191 | +end |
0 commit comments