From b015275d43c0c054802dd3d5075e17be7de4bbe6 Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Thu, 4 Apr 2024 00:06:35 -0400 Subject: [PATCH 1/6] Improve gak --- tslearn/metrics/softdtw_variants.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tslearn/metrics/softdtw_variants.py b/tslearn/metrics/softdtw_variants.py index 3be98011..1c196bc3 100644 --- a/tslearn/metrics/softdtw_variants.py +++ b/tslearn/metrics/softdtw_variants.py @@ -240,10 +240,8 @@ def gak(s1, s2, sigma=1.0, be=None): # TODO: better doc (formula for the kernel be = instantiate_backend(be, s1, s2) s1 = be.array(s1) s2 = be.array(s2) - denom = be.sqrt( - unnormalized_gak(s1, s1, sigma=sigma, be=be) - * unnormalized_gak(s2, s2, sigma=sigma, be=be) - ) + denom = be.sqrt(unnormalized_gak(s1, s1, sigma=sigma, be=be)) * be.sqrt( + unnormalized_gak(s2, s2, sigma=sigma, be=be)) return unnormalized_gak(s1, s2, sigma=sigma, be=be) / denom From 48118014f339b7ae266491afa5c0efaf28b67e06 Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Thu, 4 Apr 2024 09:36:02 -0400 Subject: [PATCH 2/6] Add data type specifications for gak to return the same data type than its input --- tslearn/metrics/softdtw_variants.py | 2 +- tslearn/metrics/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tslearn/metrics/softdtw_variants.py b/tslearn/metrics/softdtw_variants.py index 1c196bc3..03ff2808 100644 --- a/tslearn/metrics/softdtw_variants.py +++ b/tslearn/metrics/softdtw_variants.py @@ -54,7 +54,7 @@ def _gak(gram, be=None): gram = be.array(gram) sz1, sz2 = be.shape(gram) - cum_sum = be.zeros((sz1 + 1, sz2 + 1)) + cum_sum = be.zeros((sz1 + 1, sz2 + 1), dtype=gram.dtype) cum_sum[0, 0] = 1.0 for i in range(sz1): diff --git a/tslearn/metrics/utils.py b/tslearn/metrics/utils.py index 7c93f098..4109ee72 100644 --- a/tslearn/metrics/utils.py +++ b/tslearn/metrics/utils.py @@ -79,7 +79,7 @@ def _cdist_generic( if dataset2 is None: # Inspired from code by @GillesVandewiele: # https://github.com/rtavenar/tslearn/pull/128#discussion_r314978479 - matrix = be.zeros((len(dataset1), len(dataset1))) + matrix = be.zeros((len(dataset1), len(dataset1)), dtype=dataset1.dtype) indices = be.triu_indices( len(dataset1), k=0 if compute_diagonal else 1, m=len(dataset1) ) From c966e3f4c209070761d899eb420de8d3ad52845d Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Thu, 4 Apr 2024 09:38:43 -0400 Subject: [PATCH 3/6] Add a test where gak is working without overflow for constant time series equal to zero of length 405 --- tslearn/tests/test_metrics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tslearn/tests/test_metrics.py b/tslearn/tests/test_metrics.py index ebd971d7..8967fe63 100644 --- a/tslearn/tests/test_metrics.py +++ b/tslearn/tests/test_metrics.py @@ -446,6 +446,11 @@ def test_gak(): for array_type in array_types: backend = instantiate_backend(be, array_type) # GAK + gak_zeros = tslearn.metrics.gak( + s1=backend.zeros(405, dtype=backend.float64), + s2=backend.zeros(405, dtype=backend.float64), + sigma=1.0) + np.testing.assert_allclose(gak_zeros, desired=1, atol=1e-8) g = tslearn.metrics.cdist_gak( cast([[1, 2, 2, 3], [1.0, 2.0, 3.0, 4.0]], array_type), sigma=2.0, be=be ) From 18f3c1b9b21118499767d6046a179f8c94788f76 Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Tue, 16 Apr 2024 09:41:24 -0400 Subject: [PATCH 4/6] fix dtype error in the function _cdist_generic in the file utils.py --- tslearn/metrics/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tslearn/metrics/utils.py b/tslearn/metrics/utils.py index 4109ee72..84470543 100644 --- a/tslearn/metrics/utils.py +++ b/tslearn/metrics/utils.py @@ -89,7 +89,8 @@ def _cdist_generic( delayed(dist_fun)(dataset1[i], dataset1[j], *args, **kwargs) for i in range(len(dataset1)) for j in range(i if compute_diagonal else i + 1, len(dataset1)) - ) + ), + dtype=matrix.dtype ) indices = be.tril_indices(len(dataset1), k=-1, m=len(dataset1)) From edbe59d52931ca33a33f33b2519f0d7c81ad4752 Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Tue, 23 Apr 2024 15:18:29 -0400 Subject: [PATCH 5/6] Add precisions about GAK normalization in the documentation file kernel.rst --- docs/user_guide/kernel.rst | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/kernel.rst b/docs/user_guide/kernel.rst index 46dbede5..f417dfb8 100644 --- a/docs/user_guide/kernel.rst +++ b/docs/user_guide/kernel.rst @@ -52,7 +52,7 @@ Global Alignment Kernel The Global Alignment Kernel (GAK) is a kernel that operates on time series. -It is defined, for a given bandwidth :math:`\sigma`, as: +The unnormalized GAK is defined, for a given bandwidth :math:`\sigma`, as: .. math:: @@ -64,6 +64,15 @@ It is defined, for a given bandwidth :math:`\sigma`, as: where :math:`\mathcal{A}(\mathbf{x}, \mathbf{y})` is the set of all possible alignments between series :math:`\mathbf{x}` and :math:`\mathbf{y}`. +Note that the function ``gak`` is normalized in ``tslearn``: it corresponds to the quotient + +.. math:: + + \text{gak}(\mathbf{x}, \mathbf{y}) = \frac{k(\mathbf{x}, \mathbf{y})}{\sqrt{k(\mathbf{x}, \mathbf{x})k(\mathbf{y}, \mathbf{y})}} + +to ensure that :math:`\text{gak}(\mathbf{x}, \mathbf{x})=1` for all :math:`\mathbf{x}` +and :math:`\text{gak}(\mathbf{x}, \mathbf{y}) \in [0, 1]` for all :math:`\mathbf{x}, \mathbf{y}`. + It is advised in [1]_ to set the bandwidth :math:`\sigma` as a multiple of a simple estimate of the median distance of different points observed in different time-series of your training set, scaled by the square root of the @@ -81,7 +90,7 @@ This estimate is made available in ``tslearn`` through Note however that, on long time series, this estimate can lead to numerical overflows, which smaller values can avoid. -Finally, GAK is related to :ref:`softDTW ` [3]_ through the +Finally, the unnormalized GAK is related to :ref:`softDTW ` [3]_ through the following formula: .. math:: From 4f33f768b07cc1bdda45c28ccb5f8c284a88161e Mon Sep 17 00:00:00 2001 From: Yann Cabanes Date: Tue, 23 Apr 2024 15:49:22 -0400 Subject: [PATCH 6/6] Improve gak documentation in kernel.rst --- docs/user_guide/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/kernel.rst b/docs/user_guide/kernel.rst index f417dfb8..0197abd2 100644 --- a/docs/user_guide/kernel.rst +++ b/docs/user_guide/kernel.rst @@ -70,7 +70,7 @@ Note that the function ``gak`` is normalized in ``tslearn``: it corresponds to t \text{gak}(\mathbf{x}, \mathbf{y}) = \frac{k(\mathbf{x}, \mathbf{y})}{\sqrt{k(\mathbf{x}, \mathbf{x})k(\mathbf{y}, \mathbf{y})}} -to ensure that :math:`\text{gak}(\mathbf{x}, \mathbf{x})=1` for all :math:`\mathbf{x}` +This normalization ensures that :math:`\text{gak}(\mathbf{x}, \mathbf{x})=1` for all :math:`\mathbf{x}` and :math:`\text{gak}(\mathbf{x}, \mathbf{y}) \in [0, 1]` for all :math:`\mathbf{x}, \mathbf{y}`. It is advised in [1]_ to set the bandwidth :math:`\sigma` as a multiple of a