Merged
7 changes: 7 additions & 0 deletions docs/src/probabilities.md
@@ -24,6 +24,13 @@ SymbolicPermutation
SpatialSymbolicPermutation
```

## Dispersion (symbolic)

```@docs
Dispersion
ReverseDispersion
```

## Visitation frequency (binning)

```@docs
```
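For quick orientation, a minimal usage sketch of the estimators documented above; this assumes the API introduced in this PR (`Dispersion`, `ReverseDispersion`, `entropy_renyi`):

```julia
using Entropies

x = rand(1000)

# Probabilities over dispersion patterns.
p = probabilities(x, Dispersion())

# Normalized dispersion entropy: for q = 1, lies in [0, 1].
h = entropy_renyi(x, Dispersion(normalize = true), q = 1, base = 2)
```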
61 changes: 0 additions & 61 deletions src/entropies/direct_entropies/entropy_dispersion.jl

This file was deleted.

1 change: 0 additions & 1 deletion src/entropies/entropies.jl
@@ -3,6 +3,5 @@ include("tsallis.jl")
include("shannon.jl")
include("convenience_definitions.jl")
include("direct_entropies/nearest_neighbors/nearest_neighbors.jl")
include("direct_entropies/entropy_dispersion.jl")

# TODO: What else is included here from direct entropies?
110 changes: 110 additions & 0 deletions src/probabilities_estimators/dispersion/dispersion.jl
@@ -0,0 +1,110 @@
using DelayEmbeddings
using StatsBase

export Dispersion

"""
    Dispersion(; s = GaussianSymbolization(n_categories = 5), m = 2, τ = 1,
        check_unique = true, normalize = true)

A probability estimator based on dispersion patterns, originally used by
Rostaghi & Azami, 2016[^Rostaghi2016] to compute the "dispersion entropy", which
characterizes the complexity and irregularity of a time series.

Relative frequencies of dispersion patterns are computed using the symbolization scheme
`s` with embedding dimension `m` and embedding delay `τ`. Recommended parameter
values[^Li2018] are `m ∈ [2, 3]`, `τ = 1`, and `n_categories ∈ [3, 4, …, 8]` for the
Gaussian mapping (defaults to 5).

If `normalize == true`, then when used in combination with [`entropy_renyi`](@ref)
(see below), the dispersion entropy is normalized to `[0, 1]`.

If `check_unique == true` (default), then it is checked that the input has
more than one unique value. If `check_unique == false` and the input has only one
unique element, then an `InexactError` is thrown when computing probabilities.

## Probabilities vs dispersion entropy

The original dispersion entropy paper does not discuss the technique as a probability
estimator per se, but does require a step where probabilities over dispersion patterns
are explicitly computed. Hence, we provide `Dispersion` as a probability estimator.

The `Dispersion` estimator can thus be used to compute probability distributions over
dispersion patterns directly. To compute the (normalized) dispersion entropy of order `q`
to a given `base` on a univariate input time series `x`, call

```julia
entropy_renyi(x, Dispersion(normalize = true), base = base, q = q)
```

## Data requirements

The input must have more than one unique element for the Gaussian mapping to be
well-defined. Li et al. (2018)[^Li2018] recommend that `x` contain at least 1000 data points.

[^Rostaghi2016]: Rostaghi, M., & Azami, H. (2016). Dispersion entropy: A measure for time-series analysis. IEEE Signal Processing Letters, 23(5), 610-614.
[^Li2018]: Li, G., Guan, Q., & Yang, H. (2018). Noise reduction method of underwater acoustic signals based on CEEMDAN, effort-to-compress complexity, refined composite multiscale dispersion entropy and wavelet threshold denoising. Entropy, 21(1), 11.
"""
Base.@kwdef struct Dispersion <: ProbabilitiesEstimator
    s = GaussianSymbolization(n_categories = 5)
    m = 2
    τ = 1
    check_unique = true
    normalize = true
end


"""
    embed_symbols(s::AbstractVector{T}, m, τ) where {T} → Dataset{m, T}

From the symbols `sᵢ ∈ s`, create the embedding vectors (with dimension `m` and lag `τ`):

```math
s_i^D = \\{s_i, s_{i+\\tau}, \\ldots, s_{i+(m-1)\\tau}\\},
```

where ``i = 1, 2, \\ldots, N - (m - 1)\\tau`` and `N = length(s)`.
"""
function embed_symbols(symbols::AbstractVector, m, τ)
    return embed(symbols, m, τ)
end

function dispersion_histogram(x::AbstractDataset, N, m, τ)
    # Normalize counts by the number of embedding vectors, so the histogram sums to 1.
    return _non0hist(x.data, (N - (m - 1)*τ))
end

function probabilities(x::AbstractVector, est::Dispersion)
    if est.check_unique
        if length(unique(x)) == 1
            symbols = repeat([1], length(x))
        else
            symbols = symbolize(x, est.s)
        end
    else
        symbols = symbolize(x, est.s)
    end
    N = length(x)

    # We must use genembed, not embed, to make sure the zero lag is included.
    m, τ = est.m, est.τ
    τs = tuple((x for x in 0:-τ:-(m-1)*τ)...)
    dispersion_patterns = genembed(symbols, τs, ones(m))

    return Probabilities(dispersion_histogram(dispersion_patterns, N, est.m, est.τ))
end

function entropy_renyi(x::AbstractVector, est::Dispersion;
        q = 1, base = MathConstants.e, normalize = est.normalize)
    𝐩 = probabilities(x, est)
    dh = entropy_renyi(𝐩, q = q, base = base)

    if normalize
        # At most `n_categories^m` distinct dispersion patterns are possible.
        return dh / log(base, est.s.n_categories^est.m)
    else
        return dh
    end
end
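To make the symbolization-and-embedding step concrete, here is a self-contained sketch of the dispersion-pattern pipeline in plain Julia. It is illustrative only and independent of the package API; `normcdf` and `dispersion_probabilities` are names invented for this sketch:

```julia
using SpecialFunctions: erf
using Statistics: mean, std

# Cumulative distribution function of a normal distribution with mean μ and std σ.
normcdf(x, μ, σ) = 0.5 * (1 + erf((x - μ) / (σ * sqrt(2))))

function dispersion_probabilities(x, c, m, τ)
    μ, σ = mean(x), std(x)
    # Map each point to a category in 1:c via the normal CDF (the Gaussian mapping).
    s = [clamp(round(Int, c * normcdf(xᵢ, μ, σ) + 0.5), 1, c) for xᵢ in x]

    # Enumerate length-m dispersion patterns at lag τ and their relative frequencies.
    n = length(s) - (m - 1) * τ
    patterns = [ntuple(j -> s[i + (j - 1) * τ], m) for i in 1:n]
    counts = Dict{NTuple{m, Int}, Int}()
    for p in patterns
        counts[p] = get(counts, p, 0) + 1
    end
    return Dict(p => k / n for (p, k) in counts)
end

probs = dispersion_probabilities(rand(1000), 5, 2, 1)
@assert sum(values(probs)) ≈ 1.0
```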
55 changes: 55 additions & 0 deletions src/probabilities_estimators/dispersion/dispersion_reverse.jl
@@ -0,0 +1,55 @@
export ReverseDispersion
export distance_to_whitenoise

"""
    ReverseDispersion(; s = GaussianSymbolization(n_categories = 5), m = 2, τ = 1,
        check_unique = true, normalize = true)

A probability estimator that Li et al. (2019)[^Li2019] use to compute the
"reverse dispersion entropy", which measures how far from being white noise a signal is.

For computing probabilities, [`ReverseDispersion`](@ref) is equivalent to
[`Dispersion`](@ref), and parameters have the same meaning. For computing entropies,
however, [`ReverseDispersion`](@ref) adds a step that computes and normalizes the
distance from white noise (see Li et al., 2019[^Li2019] for details).

See also: [`Dispersion`](@ref).

[^Li2019]: Li, Y., Gao, X., & Wang, L. (2019). Reverse dispersion entropy: a new
complexity measure for sensor signal. Sensors, 19(23), 5203.
"""
Base.@kwdef struct ReverseDispersion <: ProbabilitiesEstimator
    s = GaussianSymbolization(n_categories = 5)
    m = 2
    τ = 1
    check_unique = true
    normalize = true
end

function probabilities(x::AbstractVector, est::ReverseDispersion)
    probabilities(x, Dispersion(m = est.m, τ = est.τ, s = est.s,
        check_unique = est.check_unique, normalize = est.normalize))
end


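"""
    distance_to_whitenoise(𝐩::Probabilities, N, m)

Compute the distance of the dispersion-pattern distribution `𝐩` from the distribution
for white noise, for which all `N^m` possible patterns (`N` categories, embedding
dimension `m`) are equiprobable (eq. 3 in Li et al., 2019).
"""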
function distance_to_whitenoise(𝐩::Probabilities, N, m)
    # We can safely skip non-occurring symbols, because they don't contribute
    # to the sum in eq. 3 in Li et al. (2019).
    return sum(𝐩[i]^2 for i in eachindex(𝐩)) - 1/(N^m)
end

function entropy_renyi(x::AbstractVector{T}, est::ReverseDispersion;
        q = 1, base = 2, normalize = est.normalize) where T <: Real

    𝐩 = probabilities(x, est)
    Hrde = distance_to_whitenoise(𝐩, est.s.n_categories, est.m)

    if normalize
        # The factor `f` considers *all* possible symbols (also non-occurring).
        f = est.s.n_categories^est.m
        return Hrde / (1 - 1/f)
    else
        return Hrde
    end
end
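As a sanity check on the formula above: for `f = N^m` possible patterns, the squared distance to the uniform (white-noise) distribution satisfies `∑ᵢ (pᵢ - 1/f)² = ∑ᵢ pᵢ² - 1/f` whenever `∑ᵢ pᵢ = 1`, which is exactly the quantity `distance_to_whitenoise` computes. A standalone sketch:

```julia
# Verify the white-noise distance identity used in `distance_to_whitenoise`.
n_categories, m = 3, 2
f = n_categories^m                    # number of possible dispersion patterns

p = rand(f); p ./= sum(p)             # an arbitrary probability distribution

lhs = sum((pᵢ - 1/f)^2 for pᵢ in p)   # definition: squared distance to uniform
rhs = sum(pᵢ^2 for pᵢ in p) - 1/f     # simplified form used in the code above

@assert lhs ≈ rhs
```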
4 changes: 3 additions & 1 deletion src/probabilities_estimators/probabilities_estimators.jl
@@ -3,4 +3,6 @@ include("rectangular_binning/rectangular_binning.jl")
include("permutation_ordinal/symbolic.jl")
include("kernel_density.jl")
include("timescales/timescales.jl")
include("transfer_operator/transfer_operator.jl")
include("transfer_operator/transfer_operator.jl")
include("dispersion/dispersion.jl")
include("dispersion/dispersion_reverse.jl")
50 changes: 50 additions & 0 deletions test/dispersion.jl
@@ -0,0 +1,50 @@
@testset "Dispersion methods" begin
x = rand(100)

@testset "Dispersion" begin
est = Dispersion()

@testset "Internals" begin
Review comment (Member): A key property of a good test suite is robustness, meaning one tests that the expected functionality is met, not how it is met. Consequently, only the exported API should be used in the test suite; the internals are not tested directly, only their final outcome. See also #85.
            # Li et al. (2018) recommend using at least 1000 data points when estimating
            # dispersion entropy.
            x = rand(1000)
            n_categories = 4
            m = 4
            τ = 1
            s = GaussianSymbolization(n_categories = n_categories)

            # Symbols should be in the set [1, 2, …, n_categories].
            symbols = Entropies.symbolize(x, s)
            @test all([s ∈ collect(1:n_categories) for s in symbols])

            # Dispersion patterns should have a normalized histogram that sums to 1.0.
            dispersion_patterns = DelayEmbeddings.embed(symbols, m, τ)
            hist = Entropies.dispersion_histogram(dispersion_patterns, length(x), m, τ)
            @test sum(hist) ≈ 1.0
        end

        ps = probabilities(x, est)
        @test ps isa Probabilities

        de = entropy_renyi(x, est, q = 1, base = 2, normalize = false)
        de_norm = entropy_renyi(x, est, q = 1, base = 2, normalize = true)
        @test de isa Real
        @test de_norm isa Real
        @test de >= 0.0
        @test de_norm >= 0.0
    end

    @testset "ReverseDispersion" begin
        est = ReverseDispersion()
        ps = probabilities(x, est)
        @test ps isa Probabilities

        de = entropy_renyi(x, est, q = 1, base = 2, normalize = false)
        de_norm = entropy_renyi(x, est, q = 1, base = 2, normalize = true)
        @test de isa Real
        @test de_norm isa Real
        @test de >= 0.0
        @test de_norm >= 0.0
    end
end
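In the spirit of the review comment above, the `Internals` block could eventually be replaced by assertions against the exported API only. A sketch, assuming the names introduced in this PR:

```julia
@testset "Dispersion, exported API only" begin
    x = rand(1000)
    p = probabilities(x, Dispersion())
    @test p isa Probabilities
    @test sum(p) ≈ 1.0

    # For q = 1, the normalized dispersion entropy should lie in [0, 1].
    h = entropy_renyi(x, Dispersion(), q = 1, base = 2, normalize = true)
    @test 0.0 <= h <= 1.0
end
```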
26 changes: 2 additions & 24 deletions test/runtests.jl
@@ -8,8 +8,9 @@ using Neighborhood: KDTree, BruteForce
# TODO: This is how the tests should look in the end:
defaultname(file) = splitext(basename(file))[1]
testfile(file, testname=defaultname(file)) = @testset "$testname" begin; include(file); end
@testset "Entopies.jl" begin
@testset "Entropies.jl" begin
testfile("timescales.jl")
testfile("dispersion.jl")
end

@testset "Histogram estimation" begin
@@ -325,29 +326,6 @@ end
end
end

@testset "Dispersion entropy" begin
# Li et al. (2018) recommends using at least 1000 data points when estimating
# dispersion entropy.
x = rand(1000)
n_categories = 4
m = 4
τ = 1
s = GaussianSymbolization(n_categories = n_categories)

# Symbols should be in the set [1, 2, …, n_categories].
symbols = Entropies.symbolize(x, s)
@test all([s ∈ collect(1:n_categories) for s in symbols])

# Dispersion patterns should have a normalized histogram that sums to 1.0.
dispersion_patterns = DelayEmbeddings.embed(symbols, m, τ)
hist = Entropies.dispersion_histogram(dispersion_patterns, length(x), m, τ)
@test sum(hist) ≈ 1.0

de = entropy_dispersion(x, s, m = 4, τ = 1)
@test typeof(de) <: Real
@test de >= 0.0
end

@testset "Tsallis" begin
p = Probabilities(repeat([1/5], 5))
@assert round(entropy_tsallis(p, q = -1/2, k = 1), digits = 2) ≈ 6.79