Merged
Changes from 6 commits
6 changes: 6 additions & 0 deletions docs/src/complexity_measures.md
@@ -4,4 +4,10 @@

## Approximate entropy

## Reverse dispersion entropy

```@docs
reverse_dispersion
```

## Disequilibrium
6 changes: 6 additions & 0 deletions docs/src/probabilities.md
@@ -24,6 +24,12 @@ SymbolicPermutation
SpatialSymbolicPermutation
```

## Dispersion (symbolic)

```@docs
Dispersion
```

## Visitation frequency (binning)

```@docs
2 changes: 2 additions & 0 deletions src/Entropies.jl
@@ -16,6 +16,8 @@ include("symbolization/symbolize.jl")
include("probabilities.jl")
include("probabilities_estimators/probabilities_estimators.jl")
include("entropies/entropies.jl")
include("complexity_measures/complexity_measures.jl")

include("deprecations.jl")


1 change: 1 addition & 0 deletions src/complexity_measures/complexity_measures.jl
@@ -0,0 +1 @@
include("reverse_dispersion_entropy.jl")
35 changes: 35 additions & 0 deletions src/complexity_measures/reverse_dispersion_entropy.jl
@@ -0,0 +1,35 @@
export reverse_dispersion

function distance_to_whitenoise(p::Probabilities, N, m)
# We can safely skip non-occurring symbols, because they don't contribute
# to the sum in eq. 3 in Li et al. (2019)
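    # Expanding eq. 3: ∑ᵢ (pᵢ - 1/N^m)² = ∑ᵢ pᵢ² - 2/N^m + 1/N^m = ∑ᵢ pᵢ² - 1/N^m,
    # using ∑ᵢ pᵢ = 1, so only the squared probabilities of occurring symbols are needed.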
return sum(abs2, p) - 1/(N^m)
end

"""
    reverse_dispersion(x::AbstractVector{T}; s = GaussianSymbolization(5),
        m = 2, τ = 1, normalize = true) where T <: Real

Compute the reverse dispersion entropy complexity measure (Li et al., 2019)[^Li2019],
which quantifies how far a signal is from being white noise. If `normalize == true`,
the result is divided by its maximum possible value, so that it lies in `[0, 1]`.
[^Li2019]: Li, Y., Gao, X., & Wang, L. (2019). Reverse dispersion entropy: a new
complexity measure for sensor signal. Sensors, 19(23), 5203.
"""
function reverse_dispersion(x::AbstractVector{T}; s = GaussianSymbolization(5),
m = 2, τ = 1, normalize = true) where T <: Real
    est = Dispersion(τ = τ, m = m, s = s)
p = probabilities(x, est)

# The following step combines distance information with the probabilities, so
    # from here on, it is not possible to use `entropy_renyi` or similar methods, because
# we're not dealing with probabilities anymore.
Hrde = distance_to_whitenoise(p, s.n_categories, m)

if normalize
# The factor `f` considers *all* possible symbols (also non-occurring)
f = s.n_categories^m
return Hrde / (1 - (1/f))
else
return Hrde
end
end
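Not part of the diff: a minimal sketch of how the new function might be called, based on the
signature and keyword defaults above (the input series and parameter values are purely
illustrative):

```julia
using Entropies

x = rand(1000)

# Distance to white noise, rescaled to [0, 1] by the maximum possible distance.
rde = reverse_dispersion(x; s = GaussianSymbolization(n_categories = 5), m = 2, τ = 1,
    normalize = true)

# Raw (unnormalized) distance.
rde_raw = reverse_dispersion(x; normalize = false)
```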
61 changes: 0 additions & 61 deletions src/entropies/direct_entropies/entropy_dispersion.jl

This file was deleted.

1 change: 0 additions & 1 deletion src/entropies/entropies.jl
@@ -3,6 +3,5 @@ include("tsallis.jl")
include("shannon.jl")
include("convenience_definitions.jl")
include("direct_entropies/nearest_neighbors/nearest_neighbors.jl")
include("direct_entropies/entropy_dispersion.jl")

# TODO: What else is included here from direct entropies?
113 changes: 113 additions & 0 deletions src/probabilities_estimators/dispersion/dispersion.jl
@@ -0,0 +1,113 @@
using DelayEmbeddings
using StatsBase

export Dispersion

"""
Dispersion(; s = GaussianSymbolization(5), m = 2, τ = 1, check_unique = true,
normalize = true)

A probability estimator based on dispersion patterns, originally used by
Rostaghi & Azami, 2016[^Rostaghi2016] to compute the "dispersion entropy", which
characterizes the complexity and irregularity of a time series.

Relative frequencies of dispersion patterns are computed using the symbolization scheme
`s` with embedding dimension `m` and embedding delay `τ`. Recommended parameter
values[^Li2018] are `m ∈ [2, 3]`, `τ = 1`, and `n_categories ∈ [3, 4, …, 8]` for the
Gaussian mapping (defaults to 5).

If `normalize == true`, then when used in combination with [`entropy_renyi`](@ref)
(see below), the dispersion entropy is normalized to `[0, 1]`.

If `check_unique == true` (default), then it is checked that the input has
more than one unique value. If `check_unique == false` and the input only has one
unique element, then an `InexactError` is thrown when trying to compute probabilities.

## Probabilities vs dispersion entropy

The original dispersion entropy paper does not discuss the technique as a probability
estimator per se, but does require a step where probabilities over dispersion patterns
are explicitly computed. Hence, we provide `Dispersion` as a probability estimator.

To compute (normalized) dispersion entropy of order `q = 1` to a given `base` on the
univariate input time series `x`, do:

```julia
entropy_renyi(x, Dispersion(normalize = true), base = base, q = 1)
```

Normalization is currently only implemented for `q == 1`; for other orders, use
`Dispersion(normalize = false)`.

## Data requirements

The input must have more than one unique element for the Gaussian mapping to be
well-defined. Li et al. (2018)[^Li2018] recommend that `x` contain at least 1000 data points.

[^Rostaghi2016]: Rostaghi, M., & Azami, H. (2016). Dispersion entropy: A measure for time-series analysis. IEEE Signal Processing Letters, 23(5), 610-614.
[^Li2018]: Li, G., Guan, Q., & Yang, H. (2018). Noise reduction method of underwater acoustic signals based on CEEMDAN, effort-to-compress complexity, refined composite multiscale dispersion entropy and wavelet threshold denoising. Entropy, 21(1), 11.
"""
Base.@kwdef struct Dispersion <: ProbabilitiesEstimator
s = GaussianSymbolization(n_categories = 5)
m = 2
τ = 1
    check_unique = true
normalize = true
end

export entropy_dispersion

"""
    embed_symbols(s::AbstractVector{T}, m, τ) where {T} → Dataset{m, T}

From the symbols `sᵢ ∈ s`, create the embedding vectors (with dimension `m` and lag `τ`):

```math
s_i^D = \\{s_i, s_{i+\\tau}, \\ldots, s_{i+(m-1)\\tau} \\}
```

where ``i = 1, 2, \\ldots, N - (m - 1)\\tau`` and `N = length(s)`.
"""
function embed_symbols(symbols::AbstractVector, m, τ)
return embed(symbols, m, τ)
end

function dispersion_histogram(x::AbstractDataset, N, m, τ)
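    # A length-N series yields N - (m - 1)*τ dispersion patterns (embedding vectors),
    # so normalizing the counts by this number makes the histogram sum to 1.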
return _non0hist(x.data, (N - (m - 1)*τ))
end

function probabilities(x::AbstractVector, est::Dispersion)
if est.check_unique
if length(unique(x)) == 1
symbols = repeat([1], length(x))
else
symbols = symbolize(x, est.s)
end
else
symbols = symbolize(x, est.s)
end
N = length(x)

# We must use genembed, not embed, to make sure the zero lag is included
m, τ = est.m, est.τ
τs = tuple((x for x in 0:-τ:-(m-1)*τ)...)
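    # For example, m = 3 and τ = 1 gives τs = (0, -1, -2): each dispersion pattern
    # consists of the symbol itself and the m - 1 preceding symbols.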
dispersion_patterns = genembed(symbols, τs, ones(m))
hist = dispersion_histogram(dispersion_patterns, N, est.m, est.τ)
    return Probabilities(hist)
end

function entropy_renyi(x::AbstractVector, est::Dispersion; q = 1, base = MathConstants.e)
p = probabilities(x, est)
dh = entropy_renyi(p, q = q, base = base)

n, m = est.s.n_categories, est.m

if est.normalize
        # TODO: is it possible to normalize for general order `q`? Need to have a literature
# dive or figure it out manually.
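        # For q == 1 (Shannon entropy), the maximum is log(base, n^m), attained by the
        # uniform distribution over the n^m possible dispersion patterns.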
if q == 1
return dh / log(base, n^m)
else
throw(ArgumentError("Normalization is not well defined when q != 1."))
end
else
return dh
end
end
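Not part of the diff: a minimal usage sketch of the estimator added in this file, assuming
`Dispersion`, `probabilities`, `entropy_renyi`, and `GaussianSymbolization` are exported by
`Entropies` as in this PR (values are illustrative):

```julia
using Entropies

x = rand(1000)
est = Dispersion(s = GaussianSymbolization(n_categories = 5), m = 2, τ = 1)

# Relative frequencies of dispersion patterns.
p = probabilities(x, est)

# Normalized dispersion entropy (only defined for q = 1).
de = entropy_renyi(x, Dispersion(normalize = true), q = 1, base = 2)
```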
3 changes: 2 additions & 1 deletion src/probabilities_estimators/probabilities_estimators.jl
@@ -3,4 +3,5 @@ include("rectangular_binning/rectangular_binning.jl")
include("permutation_ordinal/symbolic.jl")
include("kernel_density.jl")
include("timescales/timescales.jl")
include("transfer_operator/transfer_operator.jl")
include("transfer_operator/transfer_operator.jl")
include("dispersion/dispersion.jl")
7 changes: 7 additions & 0 deletions test/complexity_measures.jl
@@ -0,0 +1,7 @@
using Entropies, Test

@testset "Reverse dispersion entropy" begin
x = rand(100)
@test reverse_dispersion(x) isa Real
@test 0.0 <= reverse_dispersion(x, normalize = true) <= 1.0
end
35 changes: 35 additions & 0 deletions test/dispersion.jl
@@ -0,0 +1,35 @@
using Entropies, Test

@testset "Dispersion methods" begin
x = rand(100)

@testset "Dispersion" begin

@testset "Internals" begin
Member comment: A robust test suite tests that the expected functionality is met, not how it is met. Consequently, only the exported API should be used in the test suite; the internals themselves are not tested, only their final outcome. See also #85

# Li et al. (2018) recommends using at least 1000 data points when estimating
# dispersion entropy.
x = rand(1000)
n_categories = 4
m = 4
τ = 1
s = GaussianSymbolization(n_categories = n_categories)

# Symbols should be in the set [1, 2, …, n_categories].
symbols = Entropies.symbolize(x, s)
@test all([s ∈ collect(1:n_categories) for s in symbols])

# Dispersion patterns should have a normalized histogram that sums to 1.0.
dispersion_patterns = DelayEmbeddings.embed(symbols, m, τ)
hist = Entropies.dispersion_histogram(dispersion_patterns, length(x), m, τ)
@test sum(hist) ≈ 1.0
end

ps = probabilities(x, Dispersion())
@test ps isa Probabilities

de_norm = entropy_renyi(x, Dispersion(normalize = true), q = 1, base = 2)
@test 0.0 <= de_norm <= 1.0

@test_throws ArgumentError entropy_renyi(x, Dispersion(normalize = true), q = 2)
end
end
27 changes: 3 additions & 24 deletions test/runtests.jl
@@ -8,8 +8,10 @@ using Neighborhood: KDTree, BruteForce
# TODO: This is how the tests should look in the end:
defaultname(file) = splitext(basename(file))[1]
testfile(file, testname=defaultname(file)) = @testset "$testname" begin; include(file); end
@testset "Entopies.jl" begin
@testset "Entropies.jl" begin
testfile("timescales.jl")
testfile("dispersion.jl")
testfile("complexity_measures.jl")
end

@testset "Histogram estimation" begin
@@ -325,29 +327,6 @@ end
end
end

@testset "Dispersion entropy" begin
# Li et al. (2018) recommends using at least 1000 data points when estimating
# dispersion entropy.
x = rand(1000)
n_categories = 4
m = 4
τ = 1
s = GaussianSymbolization(n_categories = n_categories)

# Symbols should be in the set [1, 2, …, n_categories].
symbols = Entropies.symbolize(x, s)
@test all([s ∈ collect(1:n_categories) for s in symbols])

# Dispersion patterns should have a normalized histogram that sums to 1.0.
dispersion_patterns = DelayEmbeddings.embed(symbols, m, τ)
hist = Entropies.dispersion_histogram(dispersion_patterns, length(x), m, τ)
@test sum(hist) ≈ 1.0

de = entropy_dispersion(x, s, m = 4, τ = 1)
@test typeof(de) <: Real
@test de >= 0.0
end

@testset "Tsallis" begin
p = Probabilities(repeat([1/5], 5))
@assert round(entropy_tsallis(p, q = -1/2, k = 1), digits = 2) ≈ 6.79