From 5e0fa46052adfe119b1e1ce1e6e05d38efd3bb89 Mon Sep 17 00:00:00 2001
From: enricofallacara
Date: Tue, 15 Dec 2020 15:08:12 +0100
Subject: [PATCH 1/3] Parallelize the time series k-means barycenter update
 procedure

---
 tslearn/barycenters.py | 185 +++++++-------------------
 tslearn/clustering.py  | 101 ++++++++------
 tslearn/metrics.py     | 296 ++---------------------------------------
 3 files changed, 121 insertions(+), 461 deletions(-)

diff --git a/tslearn/barycenters.py b/tslearn/barycenters.py
index edf0d6bf8..162ae9f14 100644
--- a/tslearn/barycenters.py
+++ b/tslearn/barycenters.py
@@ -1,16 +1,12 @@
 """
 The :mod:`tslearn.barycenters` module gathers algorithms for time series
 barycenter computation.
-
 A barycenter (or *Fréchet mean*) is a time series :math:`b` which minimizes
 the sum of squared distances to the time series of a given data set :math:`x`:
-
 .. math:: \\min \\sum_i d( b, x_i )^2
-
 Only the methods :func:`dtw_barycenter_averaging` and
 :func:`softdtw_barycenter` can operate on variable-length time-series (see
 :ref:`here`).
-
 See the :ref:`barycenter examples` for an overview.
 """

@@ -23,6 +19,8 @@
 from sklearn.exceptions import ConvergenceWarning
 import warnings
 from sklearn.utils import check_random_state
+from joblib import Parallel, delayed

 from tslearn.utils import to_time_series_dataset, check_equal_size, \
     to_time_series, ts_size
@@ -44,25 +42,20 @@ def _set_weights(w, n):

 def euclidean_barycenter(X, weights=None):
     """Standard Euclidean barycenter computed from a set of time series.
-
     Parameters
     ----------
     X : array-like, shape=(n_ts, sz, d)
         Time series dataset.
-
     weights: None or array
         Weights of each X[i]. Must be the same size as len(X).
         If None, uniform weights are used.
-
     Returns
     -------
     numpy.array of shape (sz, d)
         Barycenter of the provided time series dataset.
-
     Notes
     -----
     This method requires a dataset of equal-sized time series
-
     Examples
     --------
     >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
@@ -132,52 +125,41 @@ def dtw_barycenter_averaging_petitjean(X, barycenter_size=None,
                                        metric_params=None, verbose=False):
     """DTW Barycenter Averaging (DBA) method.
-
     DBA was originally presented in [1]_.
     This implementation is not the one documented in the API, but is kept
     in the codebase to check the documented one for non-regression.
-
     Parameters
     ----------
     X : array-like, shape=(n_ts, sz, d)
         Time series dataset.
-
     barycenter_size : int or None (default: None)
         Size of the barycenter to generate. If None, the size of the
         barycenter is that of the data provided at fit
         time or that of the initial barycenter if specified.
-
     init_barycenter : array or None (default: None)
         Initial barycenter to start from for the optimization process.
-
     max_iter : int (default: 30)
         Number of iterations of the Expectation-Maximization optimization
         procedure.
-
     tol : float (default: 1e-5)
         Tolerance to use for early stopping: if the decrease in cost is lower
         than this value, the Expectation-Maximization procedure stops.
-
     weights: None or array
         Weights of each X[i]. Must be the same size as len(X).
         If None, uniform weights are used.
-
     metric_params: dict or None (default: None)
         DTW constraint parameters to be used.
         See :ref:`tslearn.metrics.dtw_path ` for a list of
         accepted parameters
         If None, no constraint is used for DTW computations.
-
     verbose : boolean (default: False)
         Whether to print information about the cost at each iteration or not.
-
     Returns
     -------
     numpy.array of shape (barycenter_size, d) or (sz, d) if barycenter_size \
             is None
        DBA barycenter of the provided time series dataset.
-
     Examples
     --------
     >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
@@ -210,14 +192,14 @@ def dtw_barycenter_averaging_petitjean(X, barycenter_size=None,
     array([[2.5],
            [2.5],
            [2.5]])
-
     References
     ----------
     .. [1] F. Petitjean, A. Ketterlin & P. Gancarski. A global averaging method
        for dynamic time warping, with applications to clustering. Pattern
        Recognition, Elsevier, 2011, Vol. 44, Num. 3, pp. 678-693
     """
-    X_ = to_time_series_dataset(X)
+    # Dataset conversion is intentionally skipped here: callers are expected
+    # to pass an already-formatted dataset, which avoids a redundant copy.
+    X_ = X
     if barycenter_size is None:
         barycenter_size = X_.shape[1]
     weights = _set_weights(weights, X_.shape[0])
@@ -248,29 +230,23 @@ def dtw_barycenter_averaging_petitjean(X, barycenter_size=None,

 def _mm_assignment(X, barycenter, weights, metric_params=None):
     """Compute item assignment based on DTW alignments and return cost as a
     bonus.
-
     Parameters
     ----------
     X : numpy.array of shape (n, sz, d)
         Time-series to be averaged
-
     barycenter : numpy.array of shape (barycenter_size, d)
         Barycenter as computed at the current step of the algorithm.
-
     weights: array
         Weights of each X[i]. Must be the same size as len(X).
-
     metric_params: dict or None (default: None)
         DTW constraint parameters to be used.
         See :ref:`tslearn.metrics.dtw_path ` for a list of
         accepted parameters
         If None, no constraint is used for DTW computations.
-
     Returns
     -------
     list of index pairs
         Warping paths
-
     float
         Current alignment cost
     """
@@ -279,92 +255,85 @@ def _mm_assignment(X, barycenter, weights, metric_params=None):
     n = X.shape[0]
-    cost = 0.
-    list_p_k = []
-    for i in range(n):
-        path, dist_i = dtw_path(barycenter, X[i], **metric_params)
-        cost += dist_i ** 2 * weights[i]
-        list_p_k.append(path)
+    # All DTW alignments are independent, so compute them in parallel.
+    # Threads are enough since the numba core of dtw_path releases the GIL;
+    # metric_params is forwarded so user-supplied constraints are honoured.
+    res = Parallel(backend="threading", require="sharedmem", n_jobs=-1)(
+        delayed(dtw_path)(barycenter, X[i], **metric_params)
+        for i in range(n)
+    )
+    list_p_k, dists = map(list, zip(*res))
+    cost = numpy.sum(numpy.power(dists, 2) * weights)
     cost /= weights.sum()
     return list_p_k, cost


 def _subgradient_valence_warping(list_p_k, barycenter_size, weights):
     """Compute Valence and Warping matrices from paths.
-
     Valence matrices are denoted :math:`V^{(k)}` and Warping matrices are
     :math:`W^{(k)}` in [1]_.
-
     This function returns a list of :math:`V^{(k)}` diagonals (as a vector)
     and a list of :math:`W^{(k)}` matrices.
-
     Parameters
     ----------
     list_p_k : list of index pairs
         Warping paths
-
     barycenter_size : int
         Size of the barycenter to generate.
-
     weights: array
         Weights of each X[i]. Must be the same size as len(X).
-
     Returns
     -------
     list of numpy.array of shape (barycenter_size, )
         list of weighted :math:`V^{(k)}` diagonals (as a vector)
-
     list of numpy.array of shape (barycenter_size, sz_k)
         list of weighted :math:`W^{(k)}` matrices
-
     References
     ----------
-
     .. [1] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods
        for Averaging in Dynamic Time Warping Spaces.
        Pattern Recognition, 74, 340-358.
""" - list_v_k = [] - list_w_k = [] - for k, p_k in enumerate(list_p_k): - sz_k = p_k[-1][1] + 1 - w_k = numpy.zeros((barycenter_size, sz_k)) + def create_w_k(p_k, barycenter_size): + w_k = numpy.zeros((barycenter_size, p_k[-1][1] + 1)) for i, j in p_k: w_k[i, j] = 1. - list_w_k.append(w_k * weights[k]) - list_v_k.append(w_k.sum(axis=1) * weights[k]) + return w_k + w_k_ones = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(create_w_k)(p_k,barycenter_size) for p_k in list_p_k) + def mult_w_k_weights(w_k, weight): + return w_k * weight + list_w_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(mult_w_k_weights)(w_k,weights[k]) for k,w_k in enumerate(w_k_ones)) + def mult_w_k_sum_wieghts(w_k, weight): + return w_k.sum(axis=1) * weight + list_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(mult_w_k_sum_wieghts)(w_k,weight) for (w_k, weight) in zip(w_k_ones, weights)) return list_v_k, list_w_k def _mm_valence_warping(list_p_k, barycenter_size, weights): """Compute Valence and Warping matrices from paths. - Valence matrices are denoted :math:`V^{(k)}` and Warping matrices are :math:`W^{(k)}` in [1]_. - This function returns the sum of :math:`V^{(k)}` diagonals (as a vector) and a list of :math:`W^{(k)}` matrices. - Parameters ---------- list_p_k : list of index pairs Warping paths - barycenter_size : int Size of the barycenter to generate. - weights: array Weights of each X[i]. Must be the same size as len(X). - Returns ------- numpy.array of shape (barycenter_size, ) sum of weighted :math:`V^{(k)}` diagonals (as a vector) - list of numpy.array of shape (barycenter_size, sz_k) list of weighted :math:`W^{(k)}` matrices - References ---------- - .. [1] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. @@ -373,79 +342,71 @@ def _mm_valence_warping(list_p_k, barycenter_size, weights): list_p_k=list_p_k, barycenter_size=barycenter_size, weights=weights) - diag_sum_v_k = numpy.zeros(list_v_k[0].shape) - for v_k in list_v_k: - diag_sum_v_k += v_k + #diag_sum_v_k = numpy.zeros(list_v_k[0].shape) + #for v_k in list_v_k: + # diag_sum_v_k += v_k + diag_sum_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(list_v_k))) + diag_sum_v_k = numpy.array(diag_sum_v_k) return diag_sum_v_k, list_w_k def _mm_update_barycenter(X, diag_sum_v_k, list_w_k): """Update barycenters using the formula from Algorithm 2 in [1]_. - Parameters ---------- X : numpy.array of shape (n, sz, d) Time-series to be averaged - diag_sum_v_k : numpy.array of shape (barycenter_size, ) sum of weighted :math:`V^{(k)}` diagonals (as a vector) - list_w_k : list of numpy.array of shape (barycenter_size, sz_k) list of weighted :math:`W^{(k)}` matrices - Returns ------- numpy.array of shape (barycenter_size, d) Updated barycenter - References ---------- - .. [1] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. """ - d = X.shape[2] - barycenter_size = diag_sum_v_k.shape[0] - sum_w_x = numpy.zeros((barycenter_size, d)) - for k, (w_k, x_k) in enumerate(zip(list_w_k, X)): - sum_w_x += w_k.dot(x_k[:ts_size(x_k)]) - barycenter = numpy.diag(1. 
/ diag_sum_v_k).dot(sum_w_x) + #d = X.shape[2] + #barycenter_size = diag_sum_v_k.shape[0] + #sum_w_x = numpy.zeros((barycenter_size, d)) + #for k, (w_k, x_k) in enumerate(zip(list_w_k, X)): + # sum_w_x += w_k.dot(x_k[:ts_size(x_k)]) + sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(numpy.dot)(w_k,x_k[:ts_size(x_k)]) for (w_k, x_k) in zip(list_w_k, X)) + sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem',n_jobs=15, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(sum_w_x))) + sum_w_x = numpy.array(sum_w_x).reshape((len(sum_w_x), 1)) + inverse_diag_sum_v_k = numpy.reciprocal(diag_sum_v_k) + inverse_diag_sum_v_k = numpy.diag(inverse_diag_sum_v_k) + barycenter = inverse_diag_sum_v_k.dot(sum_w_x) return barycenter def _subgradient_update_barycenter(X, list_diag_v_k, list_w_k, weights_sum, barycenter, eta): """Update barycenters using the formula from Algorithm 1 in [1]_. - Parameters ---------- X : numpy.array of shape (n, sz, d) Time-series to be averaged - list_diag_v_k : list of numpy.array of shape (barycenter_size, ) list of weighted :math:`V^{(k)}` diagonals (as vectors) - list_w_k : list of numpy.array of shape (barycenter_size, sz_k) list of weighted :math:`W^{(k)}` matrices - weights_sum : float sum of weights applied to matrices :math:`V^{(k)}` and :math:`W^{(k)}` - barycenter : numpy.array of shape (barycenter_size, d) Barycenter as computed at the previous iteration of the algorithm - eta : float Step-size for the subgradient descent algorithm - Returns ------- numpy.array of shape (barycenter_size, d) Updated barycenter - References ---------- - .. [1] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. @@ -466,57 +427,45 @@ def dtw_barycenter_averaging(X, barycenter_size=None, init_barycenter=None, verbose=False, n_init=1): """DTW Barycenter Averaging (DBA) method estimated through Expectation-Maximization algorithm. - DBA was originally presented in [1]_. This implementation is based on a idea from [2]_ (Majorize-Minimize Mean Algorithm). - Parameters ---------- X : array-like, shape=(n_ts, sz, d) Time series dataset. - barycenter_size : int or None (default: None) Size of the barycenter to generate. If None, the size of the barycenter is that of the data provided at fit time or that of the initial barycenter if specified. - init_barycenter : array or None (default: None) Initial barycenter to start from for the optimization process. - max_iter : int (default: 30) Number of iterations of the Expectation-Maximization optimization procedure. - tol : float (default: 1e-5) Tolerance to use for early stopping: if the decrease in cost is lower than this value, the Expectation-Maximization procedure stops. - weights: None or array Weights of each X[i]. Must be the same size as len(X). If None, uniform weights are used. - metric_params: dict or None (default: None) DTW constraint parameters to be used. See :ref:`tslearn.metrics.dtw_path ` for a list of accepted parameters If None, no constraint is used for DTW computations. - verbose : boolean (default: False) Whether to print information about the cost at each iteration or not. 
- n_init : int (default: 1) Number of different initializations to be tried (useful only is init_barycenter is set to None, otherwise, all trials will reach the same performance) - Returns ------- numpy.array of shape (barycenter_size, d) or (sz, d) if barycenter_size \ is None DBA barycenter of the provided time series dataset. - Examples -------- >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]] @@ -548,13 +497,11 @@ def dtw_barycenter_averaging(X, barycenter_size=None, init_barycenter=None, array([[2.5], [2.5], [2.5]]) - References ---------- .. [1] F. Petitjean, A. Ketterlin & P. Gancarski. A global averaging method for dynamic time warping, with applications to clustering. Pattern Recognition, Elsevier, 2011, Vol. 44, Num. 3, pp. 678-693 - .. [2] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. @@ -587,46 +534,36 @@ def dtw_barycenter_averaging_one_init(X, barycenter_size=None, verbose=False): """DTW Barycenter Averaging (DBA) method estimated through Expectation-Maximization algorithm. - DBA was originally presented in [1]_. This implementation is based on a idea from [2]_ (Majorize-Minimize Mean Algorithm). - Parameters ---------- X : array-like, shape=(n_ts, sz, d) Time series dataset. - barycenter_size : int or None (default: None) Size of the barycenter to generate. If None, the size of the barycenter is that of the data provided at fit time or that of the initial barycenter if specified. - init_barycenter : array or None (default: None) Initial barycenter to start from for the optimization process. - max_iter : int (default: 30) Number of iterations of the Expectation-Maximization optimization procedure. - tol : float (default: 1e-5) Tolerance to use for early stopping: if the decrease in cost is lower than this value, the Expectation-Maximization procedure stops. - weights: None or array Weights of each X[i]. Must be the same size as len(X). If None, uniform weights are used. - metric_params: dict or None (default: None) DTW constraint parameters to be used. See :ref:`tslearn.metrics.dtw_path ` for a list of accepted parameters If None, no constraint is used for DTW computations. - verbose : boolean (default: False) Whether to print information about the cost at each iteration or not. - Returns ------- numpy.array of shape (barycenter_size, d) or (sz, d) if barycenter_size \ @@ -634,34 +571,32 @@ def dtw_barycenter_averaging_one_init(X, barycenter_size=None, DBA barycenter of the provided time series dataset. float Associated inertia - References ---------- .. [1] F. Petitjean, A. Ketterlin & P. Gancarski. A global averaging method for dynamic time warping, with applications to clustering. Pattern Recognition, Elsevier, 2011, Vol. 44, Num. 3, pp. 678-693 - .. [2] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. 
""" - X_ = to_time_series_dataset(X) + #X_ = to_time_series_dataset(X) if barycenter_size is None: - barycenter_size = X_.shape[1] - weights = _set_weights(weights, X_.shape[0]) + barycenter_size = X.shape[1] + weights = _set_weights(weights, X.shape[0]) if init_barycenter is None: - barycenter = _init_avg(X_, barycenter_size) + barycenter = _init_avg(X, barycenter_size) else: barycenter_size = init_barycenter.shape[0] barycenter = init_barycenter cost_prev, cost = numpy.inf, numpy.inf for it in range(max_iter): - list_p_k, cost = _mm_assignment(X_, barycenter, weights, metric_params) + list_p_k, cost = _mm_assignment(X, barycenter, weights, metric_params) diag_sum_v_k, list_w_k = _mm_valence_warping(list_p_k, barycenter_size, weights) if verbose: print("[DBA] epoch %d, cost: %.3f" % (it + 1, cost)) - barycenter = _mm_update_barycenter(X_, diag_sum_v_k, list_w_k) + barycenter = _mm_update_barycenter(X, diag_sum_v_k, list_w_k) if abs(cost_prev - cost) < tol: break elif cost_prev < cost: @@ -682,66 +617,52 @@ def dtw_barycenter_averaging_subgradient(X, barycenter_size=None, metric_params=None, verbose=False): """DTW Barycenter Averaging (DBA) method estimated through subgradient descent algorithm. - DBA was originally presented in [1]_. This implementation is based on a idea from [2]_ (Stochastic Subgradient Mean Algorithm). - Parameters ---------- X : array-like, shape=(n_ts, sz, d) Time series dataset. - barycenter_size : int or None (default: None) Size of the barycenter to generate. If None, the size of the barycenter is that of the data provided at fit time or that of the initial barycenter if specified. - init_barycenter : array or None (default: None) Initial barycenter to start from for the optimization process. - max_iter : int (default: 30) Number of iterations of the Expectation-Maximization optimization procedure. - initial_step_size : float (default: 0.05) Initial step size for the subgradient descent algorithm. Default value is the one suggested in [2]_. - final_step_size : float (default: 0.005) Final step size for the subgradient descent algorithm. Default value is the one suggested in [2]_. - tol : float (default: 1e-5) Tolerance to use for early stopping: if the decrease in cost is lower than this value, the Expectation-Maximization procedure stops. - random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - weights: None or array Weights of each X[i]. Must be the same size as len(X). If None, uniform weights are used. - metric_params: dict or None (default: None) DTW constraint parameters to be used. See :ref:`tslearn.metrics.dtw_path ` for a list of accepted parameters If None, no constraint is used for DTW computations. - verbose : boolean (default: False) Whether to print information about the cost at each iteration or not. - Returns ------- numpy.array of shape (barycenter_size, d) or (sz, d) if barycenter_size \ is None DBA barycenter of the provided time series dataset. - Examples -------- >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]] @@ -754,20 +675,19 @@ def dtw_barycenter_averaging_subgradient(X, barycenter_size=None, [2. ], [3.5...], [4.5...]]) - References ---------- .. [1] F. Petitjean, A. Ketterlin & P. Gancarski. A global averaging method for dynamic time warping, with applications to clustering. 
Pattern Recognition, Elsevier, 2011, Vol. 44, Num. 3, pp. 678-693 - .. [2] D. Schultz and B. Jain. Nonsmooth Analysis and Subgradient Methods for Averaging in Dynamic Time Warping Spaces. Pattern Recognition, 74, 340-358. """ rng = check_random_state(random_state) - X_ = to_time_series_dataset(X) + #X_ = to_time_series_dataset(X) + X_ = X if barycenter_size is None: barycenter_size = X_.shape[1] weights = _set_weights(weights, X_.shape[0]) @@ -832,9 +752,7 @@ def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3, max_iter=50, init=None): """Compute barycenter (time series averaging) under the soft-DTW [1] geometry. - Soft-DTW was originally presented in [1]_. - Parameters ---------- X : array-like, shape=(n_ts, sz, d) @@ -855,13 +773,11 @@ def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3, init: array or None (default: None) Initial barycenter to start from for the optimization process. If `None`, euclidean barycenter is used as a starting point. - Returns ------- numpy.array of shape (bsz, d) where `bsz` is the size of the `init` array \ if provided or `sz` otherwise Soft-DTW barycenter of the provided time series dataset. - Examples -------- >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]] @@ -877,7 +793,6 @@ def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3, [2.67573269], [3.51057026], [4.33645802]]) - References ---------- .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for diff --git a/tslearn/clustering.py b/tslearn/clustering.py index e94238b07..e249e311f 100644 --- a/tslearn/clustering.py +++ b/tslearn/clustering.py @@ -2,7 +2,7 @@ The :mod:`tslearn.clustering` module gathers time series specific clustering algorithms. -**User guide:** See the :ref:`Clustering ` section for further +**User guide:** See the :ref:`Clustering ` section for further details. """ @@ -22,6 +22,7 @@ from scipy.spatial.distance import cdist import numpy import warnings +from joblib import parallel_backend, Parallel, delayed from tslearn.metrics import cdist_gak, cdist_dtw, cdist_soft_dtw, \ cdist_soft_dtw_normalized, sigma_gak @@ -181,14 +182,14 @@ def _compute_inertia(distances, assignments, squared=True): def silhouette_score(X, labels, metric=None, sample_size=None, - metric_params=None, n_jobs=None, verbose=0, + metric_params=None, n_jobs=None, verbose=0, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples (cf. [1]_ and [2]_). Read more in the `scikit-learn documentation `_. + # silhouette-coefficient>`_. Parameters ---------- @@ -228,7 +229,7 @@ def silhouette_score(X, labels, metric=None, sample_size=None, verbose : int (default: 0) If nonzero, print information about the inertia while learning - the model and joblib progress messages are printed. + the model and joblib progress messages are printed. random_state : int, RandomState instance or None, optional (default: None) The generator used to randomly select a subset of samples. 
If int, @@ -287,7 +288,7 @@ def silhouette_score(X, labels, metric=None, sample_size=None, if metric == "precomputed": sklearn_X = X elif metric == "dtw" or metric is None: - sklearn_X = cdist_dtw(X, n_jobs=n_jobs, verbose=verbose, + sklearn_X = cdist_dtw(X, n_jobs=n_jobs, verbose=verbose, global_constraint='sakoe_chiba', **metric_params_) elif metric == "softdtw": sklearn_X = cdist_soft_dtw_normalized(X, **metric_params_) @@ -328,7 +329,7 @@ class KernelKMeans(ClusterMixin, BaseModelPackage, TimeSeriesBaseEstimator): ---------- n_clusters : int (default: 3) Number of clusters to form. - + kernel : string, or callable (default: "gak") The kernel should either be "gak", in which case the Global Alignment Kernel from [2]_ is used or a value that is accepted as a metric @@ -349,12 +350,12 @@ class KernelKMeans(ClusterMixin, BaseModelPackage, TimeSeriesBaseEstimator): Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. - + kernel_params : dict or None (default: None) Kernel parameters to be passed to the kernel function. None means no kernel parameter is set. - For Global Alignment Kernel, the only parameter of interest is `sigma`. - If set to 'auto', it is computed based on a sampling of the training + For Global Alignment Kernel, the only parameter of interest is `sigma`. + If set to 'auto', it is computed based on a sampling of the training set (cf :ref:`tslearn.metrics.sigma_gak `). If no specific value is set for `sigma`, its defaults to 1. @@ -363,10 +364,10 @@ class KernelKMeans(ClusterMixin, BaseModelPackage, TimeSeriesBaseEstimator): Bandwidth parameter for the Global Alignment kernel. If set to 'auto', it is computed based on a sampling of the training set (cf :ref:`tslearn.metrics.sigma_gak `) - + .. deprecated:: 0.4 - Setting `sigma` directly as a parameter for KernelKMeans and - GlobalAlignmentKernelKMeans is deprecated in version 0.4 and will + Setting `sigma` directly as a parameter for KernelKMeans and + GlobalAlignmentKernelKMeans is deprecated in version 0.4 and will be removed in 0.6. Use `kernel_params` instead. n_jobs : int or None, optional (default=None) @@ -378,7 +379,7 @@ class KernelKMeans(ClusterMixin, BaseModelPackage, TimeSeriesBaseEstimator): for more details. verbose : int (default: 0) - If nonzero, joblib progress messages are printed. + If nonzero, joblib progress messages are printed. random_state : integer or numpy.RandomState, optional Generator used to initialize the centers. If an integer is given, it @@ -658,6 +659,7 @@ def __init__(self, **kwargs): class TimeSeriesCentroidBasedClusteringMixin: """Mixin class for centroid-based clustering of time series.""" + def _post_fit(self, X_fitted, centroids, inertia): if numpy.isfinite(inertia) and (centroids is not None): self.cluster_centers_ = centroids @@ -722,7 +724,7 @@ class TimeSeriesKMeans(TransformerMixin, ClusterMixin, verbose : int (default: 0) If nonzero, print information about the inertia while learning - the model and joblib progress messages are printed. + the model and joblib progress messages are printed. random_state : integer or numpy.RandomState, optional Generator used to initialize the centers. 
        If an integer is given, it
@@ -836,7 +838,7 @@ def _fit_one_init(self, X, x_squared_norms, rs):
         if self.metric == "dtw":
             def metric_fun(x, y):
+                # Default to a Sakoe-Chiba band for speed, while letting any
+                # user-supplied constraint in metric_params take precedence.
+                metric_params.setdefault("global_constraint", "sakoe_chiba")
                 return cdist_dtw(x, y, n_jobs=self.n_jobs,
                                  verbose=self.verbose, **metric_params)

         elif self.metric == "softdtw":
             def metric_fun(x, y):
@@ -857,32 +859,30 @@ def metric_fun(x, y):
                             "is invalid" % self.init)
         self.cluster_centers_ = _check_full_length(self.cluster_centers_)
         old_inertia = numpy.inf
-
         for it in range(self.max_iter):
             self._assign(X)
             if self.verbose:
                 print("%.3f" % self.inertia_, end=" --> ")
             self._update_centroids(X)
-
             if numpy.abs(old_inertia - self.inertia_) < self.tol:
                 break
             old_inertia = self.inertia_
-
         if self.verbose:
             print("")
         self._iter = it + 1
-
         return self

     def _transform(self, X):
         metric_params = self._get_metric_params()
         if self.metric == "euclidean":
             return cdist(X.reshape((X.shape[0], -1)),
                          self.cluster_centers_.reshape((self.n_clusters, -1)),
                          metric="euclidean")
         elif self.metric == "dtw":
+            metric_params.setdefault("global_constraint", "sakoe_chiba")
             return cdist_dtw(X, self.cluster_centers_, n_jobs=self.n_jobs,
                              verbose=self.verbose, **metric_params)
         elif self.metric == "softdtw":
             return cdist_soft_dtw(X, self.cluster_centers_, **metric_params)
         else:
@@ -897,7 +897,7 @@ def _assign(self, X, update_class_attributes=True):
         _check_no_empty_cluster(self.labels_, self.n_clusters)
         if self.dtw_inertia and self.metric != "dtw":
             inertia_dists = cdist_dtw(X, self.cluster_centers_,
-                                      n_jobs=self.n_jobs,
+                                      n_jobs=self.n_jobs,
+                                      global_constraint="sakoe_chiba",
                                       verbose=self.verbose)
         else:
             inertia_dists = dists
@@ -908,23 +908,37 @@ def _assign(self, X, update_class_attributes=True):

     def _update_centroids(self, X):
         metric_params = self._get_metric_params()
+        if self.metric == "dtw":
+            # Every DBA center update is independent of the others, so the
+            # n_clusters barycenter computations are dispatched in parallel.
+            X_list = [X[self.labels_ == k] for k in range(self.n_clusters)]
+            cluster_centers = Parallel(backend="threading",
+                                       require="sharedmem",
+                                       n_jobs=self.n_clusters)(
+                delayed(dtw_barycenter_averaging)(
+                    X=X_list[k],
+                    barycenter_size=None,
+                    init_barycenter=self.cluster_centers_[k],
+                    metric_params=metric_params,
+                    max_iter=self.max_iter_barycenter,
+                    verbose=False)
+                for k in range(self.n_clusters))
+            for k in range(self.n_clusters):
+                self.cluster_centers_[k] = cluster_centers[k]
+            return
         for k in range(self.n_clusters):
-            if self.metric == "dtw":
-                self.cluster_centers_[k] = dtw_barycenter_averaging(
-                    X=X[self.labels_ == k],
-                    barycenter_size=None,
-                    init_barycenter=self.cluster_centers_[k],
-                    metric_params=metric_params,
-                    verbose=False)
-            elif self.metric == "softdtw":
+            if self.metric == "softdtw":
                 self.cluster_centers_[k] = softdtw_barycenter(
                     X=X[self.labels_ == k],
                     max_iter=self.max_iter_barycenter,
                     init=self.cluster_centers_[k],
                     **metric_params)
             else:
                 self.cluster_centers_[k] = euclidean_barycenter(
                     X=X[self.labels_ == k])

     def fit(self, X, y=None):
         """Compute k-means clustering.
@@ -945,7 +959,6 @@ def fit(self, X, y=None):
             extend=True,
             check_n_features_only=(self.metric != "euclidean")
         )
-
         self.labels_ = None
         self.inertia_ = numpy.inf
         self.cluster_centers_ = None
@@ -956,7 +969,7 @@ def fit(self, X, y=None):

         max_attempts = max(self.n_init, 10)

-        X_ = to_time_series_dataset(X)
+        # The dataset has already been validated by check_array/check_dims
+        # above, so the redundant (and costly) conversion is skipped.
+        X_ = X

         rs = check_random_state(self.random_state)

         if self.init == "k-means++" and self.metric == "euclidean":
@@ -1034,7 +1047,7 @@ def predict(self, X):

     def transform(self, X):
         """Transform X to a cluster-distance space.
-
         In the new space, each dimension is the distance to the cluster
         centers.
diff --git a/tslearn/metrics.py b/tslearn/metrics.py
index 78d32947f..e87d603fe 100644
--- a/tslearn/metrics.py
+++ b/tslearn/metrics.py
@@ -1,7 +1,6 @@
 """
 The :mod:`tslearn.metrics` module delivers time-series specific metrics to be
 used at the core of machine learning algorithms.
-
 **User guide:** See the :ref:`Dynamic Time Warping (DTW) ` section for
 further details.
 """
@@ -40,23 +40,18 @@ def _local_squared_dist(x, y):
 @njit()
 def njit_accumulated_matrix(s1, s2, mask):
     """Compute the accumulated cost matrix score between two time series.
-
     Parameters
     ----------
     s1 : array, shape = (sz1,)
         First time series.
-
     s2 : array, shape = (sz2,)
         Second time series
-
     mask : array, shape = (sz1, sz2)
         Mask. Unconsidered cells must have infinite values.
-
     Returns
     -------
     mat : array, shape = (sz1, sz2)
         Accumulated cost matrix.
-
     """
     l1 = s1.shape[0]
     l2 = s2.shape[0]
@@ -76,23 +71,18 @@ def njit_accumulated_matrix(s1, s2, mask):
 @njit(nogil=True)
 def njit_dtw(s1, s2, mask):
     """Compute the dynamic time warping score between two time series.
-
     Parameters
     ----------
     s1 : array, shape = (sz1,)
         First time series.
-
     s2 : array, shape = (sz2,)
         Second time series
-
     mask : array, shape = (sz1, sz2)
         Mask. Unconsidered cells must have infinite values.
-
     Returns
     -------
     dtw_score : float
         Dynamic Time Warping score between both time series.
-
     """
     cum_sum = njit_accumulated_matrix(s1, s2, mask)
     return numpy.sqrt(cum_sum[-1, -1])
@@ -127,30 +117,22 @@ def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=None,
     r"""Compute Dynamic Time Warping (DTW) similarity measure between
     (possibly multidimensional) time series and return both the path and the
     similarity.
-
     DTW is computed as the Euclidean distance between aligned time series,
     i.e., if :math:`\pi` is the alignment path:
-
     .. math::
-
         DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} (X_{i} - Y_{j})^2}
-
     It is not required that both time series share the same size, but they
     must be the same dimension. DTW was originally presented in [1]_ and is
     discussed in more detail in our :ref:`dedicated user-guide page `.
-
     Parameters
     ----------
     s1
         A time series.
-
     s2
         Another time series. If not given, self-similarity of dataset1 is
         returned.
-
     global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
         Global constraint to restrict admissible paths for DTW.
-
     sakoe_chiba_radius : int or None (default: None)
         Radius to be used for Sakoe-Chiba band global constraint.
If None and `global_constraint` is set to "sakoe_chiba", a radius of @@ -160,7 +142,6 @@ def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=None, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - itakura_max_slope : float or None (default: None) Maximum slope for the Itakura parallelogram constraint. If None and `global_constraint` is set to "itakura", a maximum slope @@ -170,16 +151,13 @@ def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=None, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - Returns ------- list of integer pairs Matching path represented as a list of index pairs. In each pair, the first index corresponds to s1 and the second one corresponds to s2 - float Similarity score - Examples -------- >>> path, dist = dtw_path([1, 2, 3], [1., 2., 2., 3.]) @@ -189,19 +167,16 @@ def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=None, 0.0 >>> dtw_path([1, 2, 3], [1., 2., 2., 3., 4.])[1] 1.0 - See Also -------- dtw : Get only the similarity score for DTW cdist_dtw : Cross similarity matrix between time series datasets dtw_path_from_metric : Compute a DTW using a user-defined distance metric - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for spoken word recognition," IEEE Transactions on Acoustics, Speech and Signal Processing, vol. 26(1), pp. 43--49, 1978. - """ s1 = to_time_series(s1, remove_nans=True) s2 = to_time_series(s2, remove_nans=True) @@ -219,20 +194,16 @@ def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=None, def njit_accumulated_matrix_from_dist_matrix(dist_matrix, mask): """Compute the accumulated cost matrix score between two time series using a precomputed distance matrix. - Parameters ---------- dist_matrix : array, shape = (sz1, sz2) Array containing the pairwise distances. - mask : array, shape = (sz1, sz2) Mask. Unconsidered cells must have infinite values. - Returns ------- mat : array, shape = (sz1, sz2) Accumulated cost matrix. - """ l1, l2 = dist_matrix.shape cum_sum = numpy.full((l1 + 1, l2 + 1), numpy.inf) @@ -254,44 +225,33 @@ def dtw_path_from_metric(s1, s2=None, metric="euclidean", r"""Compute Dynamic Time Warping (DTW) similarity measure between (possibly multidimensional) time series using a distance metric defined by the user and return both the path and the similarity. - Similarity is computed as the cumulative cost along the aligned time series. - It is not required that both time series share the same size, but they must be the same dimension. DTW was originally presented in [1]_. - Valid values for metric are the same as for scikit-learn `pairwise_distances`_ function i.e. a string (e.g. "euclidean", "sqeuclidean", "hamming") or a function that is used to compute the pairwise distances. See `scikit`_ and `scipy`_ documentations for more information about the available metrics. - Parameters ---------- s1 : array, shape = (sz1, d) if metric!="precomputed", (sz1, sz2) otherwise A time series or an array of pairwise distances between samples. - s2 : array, shape = (sz2, d), optional (default: None) A second time series, only allowed if metric != "precomputed". - metric : string or callable (default: "euclidean") Function used to compute the pairwise distances between each points of `s1` and `s2`. - If metric is "precomputed", `s1` is assumed to be a distance matrix. 
- If metric is an other string, it must be one of the options compatible with sklearn.metrics.pairwise_distances. - Alternatively, if metric is a callable function, it is called on pairs of rows of `s1` and `s2`. The callable should take two 1 dimensional arrays as input and return a value indicating the distance between them. - global_constraint : {"itakura", "sakoe_chiba"} or None (default: None) Global constraint to restrict admissible paths for DTW. - sakoe_chiba_radius : int or None (default: None) Radius to be used for Sakoe-Chiba band global constraint. If None and `global_constraint` is set to "sakoe_chiba", a radius of @@ -301,7 +261,6 @@ def dtw_path_from_metric(s1, s2=None, metric="euclidean", two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - itakura_max_slope : float or None (default: None) Maximum slope for the Itakura parallelogram constraint. If None and `global_constraint` is set to "itakura", a maximum slope @@ -311,71 +270,53 @@ def dtw_path_from_metric(s1, s2=None, metric="euclidean", two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - **kwds Additional arguments to pass to sklearn pairwise_distances to compute the pairwise distances. - Returns ------- list of integer pairs Matching path represented as a list of index pairs. In each pair, the first index corresponds to s1 and the second one corresponds to s2. - float Similarity score (sum of metric along the wrapped time series). - Examples -------- Lets create 2 numpy arrays to wrap: - >>> import numpy as np >>> rng = np.random.RandomState(0) >>> s1, s2 = rng.rand(5, 2), rng.rand(6, 2) - The wrapping can be done by passing a string indicating the metric to pass to scikit-learn pairwise_distances: - >>> dtw_path_from_metric(s1, s2, ... metric="sqeuclidean") # doctest: +ELLIPSIS ([(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5)], 1.117...) - Or by defining a custom distance function: - >>> sqeuclidean = lambda x, y: np.sum((x-y)**2) >>> dtw_path_from_metric(s1, s2, metric=sqeuclidean) # doctest: +ELLIPSIS ([(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5)], 1.117...) - Or by using a precomputed distance matrix as input: - >>> from sklearn.metrics.pairwise import pairwise_distances >>> dist_matrix = pairwise_distances(s1, s2, metric="sqeuclidean") >>> dtw_path_from_metric(dist_matrix, ... metric="precomputed") # doctest: +ELLIPSIS ([(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5)], 1.117...) - Notes -------- By using a squared euclidean distance metric as shown above, the output path is the same as the one obtained by using dtw_path but the similarity score is the sum of squared distances instead of the euclidean distance. - See Also -------- dtw_path : Get both the matching path and the similarity score for DTW - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for spoken word recognition," IEEE Transactions on Acoustics, Speech and Signal Processing, vol. 26(1), pp. 43--49, 1978. - .. _pairwise_distances: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html - .. _scikit: https://scikit-learn.org/stable/modules/metrics.html - .. 
_scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html - """ # noqa: E501 if metric == "precomputed": # Pairwise distance given as input sz1, sz2 = s1.shape @@ -402,31 +343,22 @@ def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=None, itakura_max_slope=None): r"""Compute Dynamic Time Warping (DTW) similarity measure between (possibly multidimensional) time series and return it. - DTW is computed as the Euclidean distance between aligned time series, i.e., if :math:`\pi` is the optimal alignment path: - .. math:: - DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2} - Note that this formula is still valid for the multivariate case. - It is not required that both time series share the same size, but they must be the same dimension. DTW was originally presented in [1]_ and is discussed in more details in our :ref:`dedicated user-guide page `. - Parameters ---------- s1 A time series. - s2 Another time series. - global_constraint : {"itakura", "sakoe_chiba"} or None (default: None) Global constraint to restrict admissible paths for DTW. - sakoe_chiba_radius : int or None (default: None) Radius to be used for Sakoe-Chiba band global constraint. If None and `global_constraint` is set to "sakoe_chiba", a radius of @@ -436,7 +368,6 @@ def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=None, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - itakura_max_slope : float or None (default: None) Maximum slope for the Itakura parallelogram constraint. If None and `global_constraint` is set to "itakura", a maximum slope @@ -446,30 +377,25 @@ def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=None, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - Returns ------- float Similarity score - Examples -------- >>> dtw([1, 2, 3], [1., 2., 2., 3.]) 0.0 >>> dtw([1, 2, 3], [1., 2., 2., 3., 4.]) 1.0 - See Also -------- dtw_path : Get both the matching path and the similarity score for DTW cdist_dtw : Cross similarity matrix between time series datasets - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for spoken word recognition," IEEE Transactions on Acoustics, Speech and Signal Processing, vol. 26(1), pp. 43--49, 1978. - """ s1 = to_time_series(s1, remove_nans=True) s2 = to_time_series(s2, remove_nans=True) @@ -485,24 +411,18 @@ def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=None, def _max_steps(i, j, max_length, length_1, length_2): """Maximum number of steps required in a L-DTW process to reach a given cell. - Parameters ---------- i : int Cell row index - j : int Cell column index - max_length : int Maximum allowed length - length_1 : int Length of the first time series - length_2 : int Length of the second time series - Returns ------- int @@ -515,19 +435,15 @@ def _max_steps(i, j, max_length, length_1, length_2): def _limited_warping_length_cost(s1, s2, max_length): r"""Compute accumulated scores necessary fo L-DTW. - Parameters ---------- s1 A time series. - s2 Another time series. - max_length : int Maximum allowed warping path length. Should be an integer between XXX and YYY. 
# TODO - Returns ------- dict @@ -567,55 +483,43 @@ def dtw_limited_warping_length(s1, s2, max_length): r"""Compute Dynamic Time Warping (DTW) similarity measure between (possibly multidimensional) time series under an upper bound constraint on the resulting path length and return the similarity cost. - DTW is computed as the Euclidean distance between aligned time series, i.e., if :math:`\pi` is the optimal alignment path: - .. math:: - DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2} - Note that this formula is still valid for the multivariate case. - It is not required that both time series share the same size, but they must be the same dimension. DTW was originally presented in [1]_. This constrained-length variant was introduced in [2]_. Both bariants are discussed in more details in our :ref:`dedicated user-guide page ` - Parameters ---------- s1 A time series. - s2 Another time series. - max_length : int Maximum allowed warping path length. If greater than len(s1) + len(s2), then it is equivalent to unconstrained DTW. If lower than max(len(s1), len(s2)), no path can be found and a ValueError is raised. - Returns ------- float Similarity score - Examples -------- >>> dtw_limited_warping_length([1, 2, 3], [1., 2., 2., 3.], 5) 0.0 >>> dtw_limited_warping_length([1, 2, 3], [1., 2., 2., 3., 4.], 5) 1.0 - See Also -------- dtw : Get the similarity score for DTW dtw_path_limited_warping_length : Get both the warping path and the similarity score for DTW with limited warping path length - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for @@ -670,45 +574,34 @@ def dtw_path_limited_warping_length(s1, s2, max_length): (possibly multidimensional) time series under an upper bound constraint on the resulting path length and return the path as well as the similarity cost. - DTW is computed as the Euclidean distance between aligned time series, i.e., if :math:`\pi` is the optimal alignment path: - .. math:: - DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2} - Note that this formula is still valid for the multivariate case. - It is not required that both time series share the same size, but they must be the same dimension. DTW was originally presented in [1]_. This constrained-length variant was introduced in [2]_. Both variants are discussed in more details in our :ref:`dedicated user-guide page ` - Parameters ---------- s1 A time series. - s2 Another time series. - max_length : int Maximum allowed warping path length. If greater than len(s1) + len(s2), then it is equivalent to unconstrained DTW. If lower than max(len(s1), len(s2)), no path can be found and a ValueError is raised. - Returns ------- list of integer pairs Optimal path - float Similarity score - Examples -------- >>> path, cost = dtw_path_limited_warping_length([1, 2, 3], @@ -723,13 +616,11 @@ def dtw_path_limited_warping_length(s1, s2, max_length): 1.0 >>> path [(0, 0), (1, 1), (1, 2), (2, 3), (2, 4)] - See Also -------- dtw_limited_warping_length : Get the similarity score for DTW with limited warping path length dtw_path : Get both the matching path and the similarity score for DTW - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for @@ -779,15 +670,12 @@ def _subsequence_cost_matrix(subseq, longseq): def subsequence_cost_matrix(subseq, longseq): """Compute the accumulated cost matrix score between a subsequence and a reference time series. 
- Parameters ---------- subseq : array, shape = (sz1, d) Subsequence time series. - longseq : array, shape = (sz2, d) Reference time series - Returns ------- mat : array, shape = (sz1, sz2) @@ -823,7 +711,6 @@ def _subsequence_path(acc_cost_mat, idx_path_end): def subsequence_path(acc_cost_mat, idx_path_end): r"""Compute the optimal path through a accumulated cost matrix given the endpoint of the sequence. - Parameters ---------- acc_cost_mat: array, shape = (sz1, sz2) @@ -831,7 +718,6 @@ def subsequence_path(acc_cost_mat, idx_path_end): sequence. idx_path_end: int The end position of the matched subsequence in the longer sequence. - Returns ------- path: list of tuples of integer pairs @@ -839,10 +725,8 @@ def subsequence_path(acc_cost_mat, idx_path_end): first index corresponds to `subseq` and the second one corresponds to `longseq`. The startpoint of the Path is :math:`P_0 = (0, ?)` and it ends at :math:`P_L = (len(subseq)-1, idx\_path\_end)` - Examples -------- - >>> acc_cost_mat = numpy.array([[1., 0., 0., 1., 4.], ... [5., 1., 1., 0., 1.]]) >>> # calculate the globally optimal path @@ -850,12 +734,10 @@ def subsequence_path(acc_cost_mat, idx_path_end): >>> path = subsequence_path(acc_cost_mat, optimal_end_point) >>> path [(0, 2), (1, 3)] - See Also -------- dtw_subsequence_path : Get the similarity score for DTW subsequence_cost_matrix: Calculate the required cost matrix - """ return _subsequence_path(acc_cost_mat, idx_path_end) @@ -864,30 +746,23 @@ def dtw_subsequence_path(subseq, longseq): r"""Compute sub-sequence Dynamic Time Warping (DTW) similarity measure between a (possibly multidimensional) query and a long time series and return both the path and the similarity. - DTW is computed as the Euclidean distance between aligned time series, i.e., if :math:`\pi` is the alignment path: - .. math:: - DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2} - Compared to traditional DTW, here, border constraints on admissible paths :math:`\pi` are relaxed such that :math:`\pi_0 = (0, ?)` and :math:`\pi_L = (N-1, ?)` where :math:`L` is the length of the considered path and :math:`N` is the length of the subsequence time series. - It is not required that both time series share the same size, but they must be the same dimension. This implementation finds the best matching starting and ending positions for `subseq` inside `longseq`. - Parameters ---------- subseq : array, shape = (sz1, d) A query time series. longseq : array, shape = (sz2, d) A reference (supposed to be longer than `subseq`) time series. - Returns ------- list of integer pairs @@ -896,7 +771,6 @@ def dtw_subsequence_path(subseq, longseq): `longseq`. float Similarity score - Examples -------- >>> path, dist = dtw_subsequence_path([2., 3.], [1., 2., 2., 3., 4.]) @@ -904,7 +778,6 @@ def dtw_subsequence_path(subseq, longseq): [(0, 2), (1, 3)] >>> dist 0.0 - See Also -------- dtw : Get the similarity score for DTW @@ -923,23 +796,18 @@ def dtw_subsequence_path(subseq, longseq): @njit() def sakoe_chiba_mask(sz1, sz2, radius=1): """Compute the Sakoe-Chiba mask. - Parameters ---------- sz1 : int The size of the first time series - sz2 : int The size of the second time series. - radius : int The radius of the band. - Returns ------- mask : array, shape = (sz1, sz2) Sakoe-Chiba mask. - Examples -------- >>> sakoe_chiba_mask(4, 4, 1) @@ -976,18 +844,14 @@ def sakoe_chiba_mask(sz1, sz2, radius=1): def _njit_itakura_mask(sz1, sz2, max_slope=2.): """Compute the Itakura mask without checking that the constraints are feasible. 
In most cases, you should use itakura_mask instead. - Parameters ---------- sz1 : int The size of the first time series - sz2 : int The size of the second time series. - max_slope : float (default = 2) The maximum slope of the parallelogram. - Returns ------- mask : array, shape = (sz1, sz2) @@ -1025,23 +889,18 @@ def _njit_itakura_mask(sz1, sz2, max_slope=2.): def itakura_mask(sz1, sz2, max_slope=2.): """Compute the Itakura mask. - Parameters ---------- sz1 : int The size of the first time series - sz2 : int The size of the second time series. - max_slope : float (default = 2) The maximum slope of the parallelogram. - Returns ------- mask : array, shape = (sz1, sz2) Itakura mask. - Examples -------- >>> itakura_mask(6, 6) @@ -1077,21 +936,17 @@ def itakura_mask(sz1, sz2, max_slope=2.): def compute_mask(s1, s2, global_constraint=0, sakoe_chiba_radius=None, itakura_max_slope=None): """Compute the mask (region constraint). - Parameters ---------- s1 : array A time series or integer. - s2: array Another time series or integer. - global_constraint : {0, 1, 2} (default: 0) Global constraint to restrict admissible paths for DTW: - "itakura" if 1 - "sakoe_chiba" if 2 - no constraint otherwise - sakoe_chiba_radius : int or None (default: None) Radius to be used for Sakoe-Chiba band global constraint. If None and `global_constraint` is set to 2 (sakoe-chiba), a radius of @@ -1101,7 +956,6 @@ def compute_mask(s1, s2, global_constraint=0, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - itakura_max_slope : float or None (default: None) Maximum slope for the Itakura parallelogram constraint. If None and `global_constraint` is set to 1 (itakura), a maximum slope @@ -1111,7 +965,6 @@ def compute_mask(s1, s2, global_constraint=0, two. In this case, if `global_constraint` corresponds to no global constraint, a `RuntimeWarning` is raised and no global constraint is used. - Returns ------- mask : array @@ -1148,26 +1001,21 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, compute_diagonal=True, dtype=numpy.float, *args, **kwargs): """Compute cross-similarity matrix with joblib parallelization for a given similarity function. - Parameters ---------- dist_fun : function Similarity function to be used - dataset1 : array-like A dataset of time series - dataset2 : array-like (default: None) Another dataset of time series. If `None`, self-similarity of `dataset1` is returned. - n_jobs : int or None, optional (default=None) The number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learns' `Glossary `__ for more details. - verbose : int, optional (default=0) The verbosity level: if non zero, progress messages are printed. Above 50, the output is sent to stdout. @@ -1175,21 +1023,17 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, If it more than 10, all iterations are reported. `Glossary `__ for more details. - compute_diagonal : bool (default: True) Whether diagonal terms should be computed or assumed to be 0 in the self-similarity case. Used only if `dataset2` is `None`. - *args and **kwargs : Optional additional parameters to be passed to the similarity function. 
-
-
     Returns
     -------
     cdist : numpy.ndarray
         Cross-similarity matrix
     """ # noqa: E501
-    dataset1 = to_time_series_dataset(dataset1, dtype=dtype)
+    # Conversion is skipped on purpose: callers are expected to provide an
+    # already-formatted dataset so that no extra copy is made here.

     if dataset2 is None:
         # Inspired from code by @GillesVandewiele:
         # https://github.com/rtavenar/tslearn/pull/128#discussion_r314978479
         matrix = numpy.zeros((len(dataset1), len(dataset1)))
         indices = numpy.triu_indices(len(dataset1),
                                      k=0 if compute_diagonal else 1,
                                      m=len(dataset1))
         matrix[indices] = Parallel(n_jobs=n_jobs,
-                                   prefer="threads",
+                                   prefer="processes",
                                    verbose=verbose)(
             delayed(dist_fun)(
                 dataset1[i], dataset1[j],
@@ -1213,14 +1057,15 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose,
         matrix[indices] = matrix.T[indices]
         return matrix
     else:
-        dataset2 = to_time_series_dataset(dataset2, dtype=dtype)
-        matrix = Parallel(n_jobs=n_jobs, prefer="threads", verbose=verbose)(
+        # dataset2 is likewise assumed to be formatted already. A single
+        # backend hint is enough for joblib to pick a process-based backend.
+        matrix = Parallel(n_jobs=n_jobs, prefer="processes",
+                          verbose=verbose)(
             delayed(dist_fun)(
                 dataset1[i], dataset2[j],
                 *args, **kwargs
             )
             for i in range(len(dataset1)) for j in range(len(dataset2))
         )
         return numpy.array(matrix).reshape((len(dataset1), -1))
@@ -1229,29 +1074,21 @@ def cdist_dtw(dataset1, dataset2=None, global_constraint=None,
               verbose=0):
     r"""Compute cross-similarity matrix using Dynamic Time Warping (DTW)
     similarity measure.
-
     DTW is computed as the Euclidean distance between aligned time series,
     i.e., if :math:`\pi` is the alignment path:
-
     .. math::
-
         DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} (X_{i} - Y_{j})^2}
-
     DTW was originally presented in [1]_ and is discussed in more detail in
     our :ref:`dedicated user-guide page `.
-
     Parameters
     ----------
     dataset1 : array-like
         A dataset of time series
-
     dataset2 : array-like (default: None)
         Another dataset of time series. If `None`, self-similarity of
         `dataset1` is returned.
-
     global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
         Global constraint to restrict admissible paths for DTW.
-
     sakoe_chiba_radius : int or None (default: None)
         Radius to be used for Sakoe-Chiba band global constraint.
         If None and `global_constraint` is set to "sakoe_chiba", a radius of
         1 is used.
         If both `sakoe_chiba_radius` and `itakura_max_slope` are set,
         `global_constraint` is used to infer which constraint to use among the
         two. In this case, if `global_constraint` corresponds to no global
         constraint, a `RuntimeWarning` is raised and no global constraint is
         used.
-
     itakura_max_slope : float or None (default: None)
         Maximum slope for the Itakura parallelogram constraint.
         If None and `global_constraint` is set to "itakura", a maximum slope
         of 2. is used.
         If both `sakoe_chiba_radius` and `itakura_max_slope` are set,
         `global_constraint` is used to infer which constraint to use among the
         two. In this case, if `global_constraint` corresponds to no global
         constraint, a `RuntimeWarning` is raised and no global constraint is
         used.
-
     n_jobs : int or None, optional (default=None)
         The number of jobs to run in parallel.
         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See scikit-learns'
         `Glossary `__
         for more details.
-
     verbose : int, optional (default=0)
         The verbosity level: if non zero, progress messages are printed.
         Above 50, the output is sent to stdout.
         The frequency of the messages increases with the verbosity level.
         If it is more than 10, all iterations are reported.
         `Glossary `__
         for more details.
- Returns ------- cdist : numpy.ndarray Cross-similarity matrix - Examples -------- >>> cdist_dtw([[1, 2, 2, 3], [1., 2., 3., 4.]]) array([[0. , 1. ], [1. , 0. ]]) >>> cdist_dtw([[1, 2, 2, 3], [1., 2., 3., 4.]], [[1, 2, 3], [2, 3, 4, 5]]) array([[0. , 2.44948974], [1. , 1.41421356]]) - See Also -------- dtw : Get DTW similarity score - References ---------- .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for @@ -1345,12 +1175,10 @@ def _gak_gram(s1, s2, sigma=1.): def unnormalized_gak(s1, s2, sigma=1.): r"""Compute Global Alignment Kernel (GAK) between (possibly multidimensional) time series and return it. - It is not required that both time series share the same size, but they must have the same dimension. GAK was originally presented in [1]_. This is an unnormalized version. - Parameters ---------- s1 A time series s2 Another time series sigma : float (default 1.) Bandwidth of the internal Gaussian kernel used for GAK - Returns ------- float Kernel value - Examples -------- >>> unnormalized_gak([1, 2, 3], ... [1., 2., 2., 3.], ... sigma=2.) # doctest: +ELLIPSIS 15.358... >>> unnormalized_gak([1, 2, 3], ... [1., 2., 2., 3., 4.]) # doctest: +ELLIPSIS 3.166... - See Also -------- gak : normalized version of GAK that ensures that k(x,x) = 1 cdist_gak : Compute cross-similarity matrix using Global Alignment kernel - References ---------- .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011. @@ -1396,13 +1220,11 @@ def unnormalized_gak(s1, s2, sigma=1.): def gak(s1, s2, sigma=1.): # TODO: better doc (formula for the kernel) r"""Compute Global Alignment Kernel (GAK) between (possibly multidimensional) time series and return it. - It is not required that both time series share the same size, but they must have the same dimension. GAK was originally presented in [1]_. This is a normalized version that ensures that :math:`k(x,x)=1` for all :math:`x` and :math:`k(x,y) \in [0, 1]` for all :math:`x, y`. - Parameters ---------- s1 A time series s2 Another time series sigma : float (default 1.) Bandwidth of the internal Gaussian kernel used for GAK - Returns ------- float Kernel value - Examples -------- >>> gak([1, 2, 3], [1., 2., 2., 3.], sigma=2.) # doctest: +ELLIPSIS 0.839... >>> gak([1, 2, 3], [1., 2., 2., 3., 4.]) # doctest: +ELLIPSIS 0.273... - See Also -------- cdist_gak : Compute cross-similarity matrix using Global Alignment kernel - References ---------- .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011. @@ -1439,9 +1257,7 @@ def gak(s1, s2, sigma=1.): # TODO: better doc (formula for the kernel) def cdist_gak(dataset1, dataset2=None, sigma=1., n_jobs=None, verbose=0): r"""Compute cross-similarity matrix using Global Alignment kernel (GAK). - GAK was originally presented in [1]_. - Parameters ---------- dataset1 A dataset of time series dataset2 Another dataset of time series sigma : float (default 1.) Bandwidth of the internal Gaussian kernel used for GAK n_jobs : int or None, optional (default=None) The number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learn's `Glossary `__ for more details. verbose : int, optional (default=0) The verbosity level: if non-zero, progress messages are printed. Above 50, the output is sent to stdout. If it is more than 10, all iterations are reported. `Glossary `__ for more details. - Returns ------- numpy.ndarray Cross-similarity matrix - Examples -------- >>> cdist_gak([[1, 2, 2, 3], [1., 2., 3., 4.]], sigma=2.) array([[1. , 0.65629661], [0.65629661, 1. ]]) >>> cdist_gak([[1, 2, 2, 3], [1., 2., 3., 4.]], ... [[1, 2, 2, 3], [1., 2., 3., 4.], [1, 2, 2, 3]], ... sigma=2.) array([[0.71059484, 0.29722877, 0.71059484], [0.65629661, 1. 
, 0.65629661]]) - See Also -------- gak : Compute Global Alignment kernel - References ---------- .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011. @@ -1502,13 +1314,13 @@ def cdist_gak(dataset1, dataset2=None, sigma=1., n_jobs=None, verbose=0): else: dataset2 = to_time_series_dataset(dataset2) diagonal_left = Parallel(n_jobs=n_jobs, - prefer="threads", + prefer="processes", verbose=verbose)( delayed(unnormalized_gak)(dataset1[i], dataset1[i], sigma=sigma) for i in range(len(dataset1)) ) diagonal_right = Parallel(n_jobs=n_jobs, - prefer="threads", + prefer="processes", verbose=verbose)( delayed(unnormalized_gak)(dataset2[j], dataset2[j], sigma=sigma) for j in range(len(dataset2)) @@ -1520,9 +1332,7 @@ def cdist_gak(dataset1, dataset2=None, sigma=1., n_jobs=None, verbose=0): def sigma_gak(dataset, n_samples=100, random_state=None): r"""Compute sigma value to be used for GAK. - This method was originally presented in [1]_. - Parameters ---------- dataset A dataset of time series n_samples : int (default: 100) Number of samples on which median distance should be estimated random_state : integer or numpy.RandomState or None (default: None) The generator used to draw the samples. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. - Returns ------- float Suggested bandwidth (:math:`\sigma`) for the Global Alignment kernel - Examples -------- >>> dataset = [[1, 2, 2, 3], [1., 2., 3., 4.]] >>> sigma_gak(dataset=dataset, ... n_samples=200, ... random_state=0) # doctest: +ELLIPSIS 2.0... - See Also -------- gak : Compute Global Alignment kernel cdist_gak : Compute cross-similarity matrix using Global Alignment kernel - References ---------- .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011. @@ -1574,9 +1380,7 @@ def sigma_gak(dataset, n_samples=100, random_state=None): def gamma_soft_dtw(dataset, n_samples=100, random_state=None): r"""Compute gamma value to be used for GAK/Soft-DTW. - This method was originally presented in [1]_. - Parameters ---------- dataset A dataset of time series n_samples : int (default: 100) Number of samples on which median distance should be estimated random_state : integer or numpy.RandomState or None (default: None) The generator used to draw the samples. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. - Returns ------- float Suggested :math:`\gamma` parameter for the Soft-DTW - Examples -------- >>> dataset = [[1, 2, 2, 3], [1., 2., 3., 4.]] >>> gamma_soft_dtw(dataset=dataset, ... n_samples=200, ... random_state=0) # doctest: +ELLIPSIS 8.0... - See Also -------- sigma_gak : Compute sigma parameter for Global Alignment kernel - References ---------- .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011. @@ -1619,30 +1419,24 @@ def cdist_sax(dataset1, breakpoints_avg, size_fitted, dataset2=None, as presented in [1]_. Note that this function expects the time series in dataset1 and dataset2 to be normalized to each have zero mean and unit variance. - Parameters ---------- dataset1 : array-like A dataset of time series - breakpoints_avg : array-like The breakpoints used to assign the alphabet symbols. - size_fitted : int The original number of timesteps in the time series, before discretizing through SAX. - dataset2 : array-like (default: None) Another dataset of time series. If `None`, self-similarity of `dataset1` is returned. - n_jobs : int or None, optional (default=None) The number of jobs to run in parallel. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learn's `Glossary `__ for more details. - verbose : int, optional (default=0) The verbosity level: if non-zero, progress messages are printed. Above 50, the output is sent to stdout. @@ -1650,18 +1444,15 @@ def cdist_sax(dataset1, breakpoints_avg, size_fitted, dataset2=None, If it is more than 10, all iterations are reported. `Glossary `__ for more details. - Returns ------- cdist : numpy.ndarray Cross-similarity matrix - References ---------- .. [1] Lin, Jessica, et al. "Experiencing SAX: a novel symbolic representation of time series." Data Mining and Knowledge Discovery 15.2 (2007): 107-144. - """ # noqa: E501 return _cdist_generic(cydist_sax, dataset1, dataset2, n_jobs, verbose, False, int, breakpoints_avg, size_fitted) @@ -1669,9 +1460,7 @@ def cdist_sax(dataset1, breakpoints_avg, size_fitted, dataset2=None, def lb_keogh(ts_query, ts_candidate=None, radius=1, envelope_candidate=None): r"""Compute LB_Keogh. - LB_Keogh was originally presented in [1]_. - Parameters ---------- ts_query : array-like Query time-series to compare to the envelope of the candidate. ts_candidate : array-like or None (default: None) Candidate time-series. None means the envelope is provided via `envelope_candidate` parameter and hence does not need to be computed again. radius : int (default: 1) Radius to be used for the envelope generation (the envelope at time index i will be generated based on all observations from the time series at indices comprised between i-radius and i+radius). Not used if `ts_candidate` is None. envelope_candidate: pair of array-like (envelope_down, envelope_up) or None (default: None) Pre-computed envelope of the candidate time series. If set to None, it is computed based on `ts_candidate`. - Notes ----- This method requires a `ts_query` and `ts_candidate` (or `envelope_candidate`, depending on the call) to be of equal size. - Returns ------- float Distance between the query time series and the envelope of the candidate time series. - Examples -------- >>> ts1 = [1, 2, 3, 2, 1] @@ -1714,11 +1500,9 @@ def lb_keogh(ts_query, ts_candidate=None, radius=1, envelope_candidate=None): ... ts_candidate=ts1, ... radius=1) # doctest: +ELLIPSIS 2.8284... - See also -------- lb_envelope : Compute LB_Keogh-related envelope - References ---------- .. [1] Keogh, E. Exact indexing of dynamic time warping. In International @@ -1764,9 +1548,7 @@ def njit_lb_envelope(time_series, radius): def lb_envelope(ts, radius=1): r"""Compute time-series envelope as required by LB_Keogh. - LB_Keogh was originally presented in [1]_. - Parameters ---------- ts : array-like Time series for which the envelope should be computed. radius : int (default: 1) Radius to be used for the envelope generation (the envelope at time index i will be generated based on all observations from the time series at indices comprised between i-radius and i+radius). - Returns ------- array-like Lower-side of the envelope. array-like Upper-side of the envelope. - Examples -------- >>> ts1 = [1, 2, 3, 2, 1] @@ -1800,11 +1580,9 @@ def lb_envelope(ts, radius=1): [3.], [3.], [2.]]) - See also -------- lb_keogh : Compute LB_Keogh similarity - References ---------- .. [1] Keogh, E. Exact indexing of dynamic time warping. In International @@ -1815,25 +1593,18 @@ def lb_envelope(ts, radius=1): def soft_dtw(ts1, ts2, gamma=1.): r"""Compute Soft-DTW metric between two time series. - Soft-DTW was originally presented in [1]_ and is discussed in more details in our :ref:`user-guide page on DTW and its variants`. - Soft-DTW is computed as: - .. math:: - \text{soft-DTW}_{\gamma}(X, Y) = \min_{\pi}{}^\gamma \sum_{(i, j) \in \pi} \|X_i, Y_j\|^2 - where :math:`\min^\gamma` is the soft-min operator of parameter :math:`\gamma`. - In the limit case :math:`\gamma = 0`, :math:`\min^\gamma` reduces to a hard-min operator and soft-DTW is defined as the square of the DTW similarity measure. - Parameters ---------- ts1 A time series ts2 Another time series gamma : float (default 1.) 
Gamma parameter for Soft-DTW - Returns ------- float Similarity - Examples -------- >>> soft_dtw([1, 2, 2, 3], ... [1., 2., 2.1, 3.2], ... gamma=1.) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS -0.89... >>> soft_dtw([1, 2, 3, 3], ... [1., 2., 2.1, 3.2], ... gamma=0.01) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS 0.089... - See Also -------- cdist_soft_dtw : Cross similarity matrix between time series datasets - References ---------- .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for @@ -1876,25 +1643,18 @@ def soft_dtw(ts1, ts2, gamma=1.): def cdist_soft_dtw(dataset1, dataset2=None, gamma=1.): r"""Compute cross-similarity matrix using Soft-DTW metric. - Soft-DTW was originally presented in [1]_ and is discussed in more details in our :ref:`user-guide page on DTW and its variants`. - Soft-DTW is computed as: - .. math:: - \text{soft-DTW}_{\gamma}(X, Y) = \min_{\pi}{}^\gamma \sum_{(i, j) \in \pi} \|X_i, Y_j\|^2 - where :math:`\min^\gamma` is the soft-min operator of parameter :math:`\gamma`. - In the limit case :math:`\gamma = 0`, :math:`\min^\gamma` reduces to a hard-min operator and soft-DTW is defined as the square of the DTW similarity measure. - Parameters ---------- dataset1 A dataset of time series dataset2 Another dataset of time series gamma : float (default 1.) Gamma parameter for Soft-DTW - Returns ------- numpy.ndarray Cross-similarity matrix - Examples -------- >>> cdist_soft_dtw([[1, 2, 2, 3], [1., 2., 3., 4.]], gamma=.01) array([[-0.01098612, 1. ], [ 1. , 0. ]]) >>> cdist_soft_dtw([[1, 2, 2, 3], [1., 2., 3., 4.]], ... [[1, 2, 2, 3], [1., 2., 3., 4.]], gamma=.01) array([[-0.01098612, 1. ], [ 1. , 0. ]]) - See Also -------- soft_dtw : Compute Soft-DTW cdist_soft_dtw_normalized : Cross similarity matrix between time series datasets using a normalized version of Soft-DTW - References ---------- .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for @@ -1961,65 +1717,48 @@ def cdist_soft_dtw(dataset1, dataset2=None, gamma=1.): def cdist_soft_dtw_normalized(dataset1, dataset2=None, gamma=1.): r"""Compute cross-similarity matrix using a normalized version of the Soft-DTW metric. - Soft-DTW was originally presented in [1]_ and is discussed in more details in our :ref:`user-guide page on DTW and its variants`. - Soft-DTW is computed as: - .. math:: - \text{soft-DTW}_{\gamma}(X, Y) = \min_{\pi}{}^\gamma \sum_{(i, j) \in \pi} \|X_i, Y_j\|^2 - where :math:`\min^\gamma` is the soft-min operator of parameter :math:`\gamma`. - In the limit case :math:`\gamma = 0`, :math:`\min^\gamma` reduces to a hard-min operator and soft-DTW is defined as the square of the DTW similarity measure. - This normalized version is defined as: - .. math:: - \text{norm-soft-DTW}_{\gamma}(X, Y) = \text{soft-DTW}_{\gamma}(X, Y) - \frac{1}{2} \left(\text{soft-DTW}_{\gamma}(X, X) + \text{soft-DTW}_{\gamma}(Y, Y)\right) - and ensures that all returned values are positive and that :math:`\text{norm-soft-DTW}_{\gamma}(X, X) = 0`. - Parameters ---------- dataset1 A dataset of time series - dataset2 Another dataset of time series - gamma : float (default 1.) Gamma parameter for Soft-DTW - Returns ------- numpy.ndarray Cross-similarity matrix - Examples -------- >>> time_series = numpy.random.randn(10, 15, 1) >>> numpy.alltrue(cdist_soft_dtw_normalized(time_series) >= 0.) True - See Also -------- soft_dtw : Compute Soft-DTW cdist_soft_dtw : Cross similarity matrix between time series datasets using the unnormalized version of Soft-DTW - References ---------- .. [1] M. Cuturi, M. 
Blondel "Soft-DTW: a Differentiable Loss Function for @@ -2039,7 +1778,6 @@ def __init__(self, D, gamma=1.): gamma: float Regularization parameter. Lower is less smoothed (closer to true DTW). - Attributes ---------- self.R_: array, shape = [m + 2, n + 2] @@ -2062,7 +1800,6 @@ def __init__(self, D, gamma=1.): def compute(self): """Compute soft-DTW by dynamic programming. - Returns ------- sdtw: float @@ -2078,7 +1815,6 @@ def compute(self): def grad(self): """Compute gradient of soft-DTW w.r.t. D by dynamic programming. - Returns ------- grad: array, shape = [m, n] @@ -2113,7 +1849,6 @@ def __init__(self, X, Y): First time series. Y: array, shape = [n, d] Second time series. - Examples -------- >>> SquaredEuclidean([1, 2, 2, 3], [1, 2, 3, 4]).compute() @@ -2127,7 +1862,6 @@ def __init__(self, X, Y): def compute(self): """Compute distance matrix. - Returns ------- D: array, shape = [m, n] @@ -2138,12 +1872,10 @@ def compute(self): def jacobian_product(self, E): """Compute the product between the Jacobian (a linear map from m x d to m x n) and a matrix E. - Parameters ---------- E: array, shape = [m, n] Matrix to be multiplied by the Jacobian. - Returns ------- G: array, shape = [m, d] @@ -2154,4 +1886,4 @@ def jacobian_product(self, E): _jacobian_product_sq_euc(self.X, self.Y, E.astype(numpy.float64), G) - return G + return G \ No newline at end of file From f7bb66b9263d3cb4c4e96f338a7caa838b2daa2b Mon Sep 17 00:00:00 2001 From: enricofallacara Date: Wed, 16 Dec 2020 10:58:48 +0100 Subject: [PATCH 2/3] fixed number of threads parameter --- tslearn/barycenters.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tslearn/barycenters.py b/tslearn/barycenters.py index 162ae9f14..b92db80ee 100644 --- a/tslearn/barycenters.py +++ b/tslearn/barycenters.py @@ -20,7 +20,6 @@ import warnings from sklearn.utils import check_random_state from joblib import parallel_backend, Parallel, delayed -import psutil from tslearn.utils import to_time_series_dataset, check_equal_size, \ to_time_series, ts_size @@ -227,7 +226,7 @@ def dtw_barycenter_averaging_petitjean(X, barycenter_size=None, return barycenter -def _mm_assignment(X, barycenter, weights, metric_params=None): +def _mm_assignment(X, barycenter, weights, metric_params=None, n_threads=15): """Computes item assignment based on DTW alignments and returns the cost as a bonus. Parameters ---------- X : numpy.array of shape (n, sz, d) Time-series to be averaged @@ -256,7 +255,7 @@ def _mm_assignment(X, barycenter, weights, metric_params=None): cost = 0. list_p_k = [] #with parallel_backend('threading'): - res = Parallel(backend='threading',prefer="threads",require='sharedmem',n_jobs=15,verbose=10)(delayed(dtw_path)(barycenter, X[i], global_constraint="sakoe_chiba",sakoe_chiba_radius=1) for i in range(n)) + res = Parallel(backend='threading',prefer="threads",require='sharedmem',n_jobs=n_threads,verbose=10)(delayed(dtw_path)(barycenter, X[i], global_constraint="sakoe_chiba",sakoe_chiba_radius=1) for i in range(n)) paths, dists = zip(*res) paths = list(paths) @@ -271,7 +270,7 @@ def _mm_assignment(X, barycenter, weights, metric_params=None): return list_p_k, cost -def _subgradient_valence_warping(list_p_k, barycenter_size, weights): +def _subgradient_valence_warping(list_p_k, barycenter_size, weights, n_threads): """Compute Valence and Warping matrices from paths. Valence matrices are denoted :math:`V^{(k)}` and Warping matrices are :math:`W^{(k)}` in [1]_. @@ -302,17 +301,17 @@ def create_w_k(p_k, barycenter_size): for i, j in p_k: w_k[i, j] = 1. 
return w_k - w_k_ones = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(create_w_k)(p_k,barycenter_size) for p_k in list_p_k) + w_k_ones = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=n_threads, verbose = True)(delayed(create_w_k)(p_k,barycenter_size) for p_k in list_p_k) def mult_w_k_weights(w_k, weight): return w_k * weight - list_w_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(mult_w_k_weights)(w_k,weights[k]) for k,w_k in enumerate(w_k_ones)) + list_w_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=n_threads, verbose = True)(delayed(mult_w_k_weights)(w_k,weights[k]) for k,w_k in enumerate(w_k_ones)) def mult_w_k_sum_weights(w_k, weight): return w_k.sum(axis=1) * weight - list_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(mult_w_k_sum_weights)(w_k,weight) for (w_k, weight) in zip(w_k_ones, weights)) + list_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=n_threads, verbose = True)(delayed(mult_w_k_sum_weights)(w_k,weight) for (w_k, weight) in zip(w_k_ones, weights)) return list_v_k, list_w_k -def _mm_valence_warping(list_p_k, barycenter_size, weights): +def _mm_valence_warping(list_p_k, barycenter_size, weights, n_threads): """Compute Valence and Warping matrices from paths. Valence matrices are denoted :math:`V^{(k)}` and Warping matrices are :math:`W^{(k)}` in [1]_. @@ -345,12 +344,12 @@ def _mm_valence_warping(list_p_k, barycenter_size, weights): #diag_sum_v_k = numpy.zeros(list_v_k[0].shape) #for v_k in list_v_k: # diag_sum_v_k += v_k - diag_sum_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(list_v_k))) + diag_sum_v_k = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=n_threads, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(list_v_k))) diag_sum_v_k = numpy.array(diag_sum_v_k) return diag_sum_v_k, list_w_k -def _mm_update_barycenter(X, diag_sum_v_k, list_w_k): +def _mm_update_barycenter(X, diag_sum_v_k, list_w_k, n_threads): """Update barycenters using the formula from Algorithm 2 in [1]_. 
Parameters ---------- @@ -375,8 +374,8 @@ def _mm_update_barycenter(X, diag_sum_v_k, list_w_k): #sum_w_x = numpy.zeros((barycenter_size, d)) #for k, (w_k, x_k) in enumerate(zip(list_w_k, X)): # sum_w_x += w_k.dot(x_k[:ts_size(x_k)]) - sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=15, verbose = True)(delayed(numpy.dot)(w_k,x_k[:ts_size(x_k)]) for (w_k, x_k) in zip(list_w_k, X)) - sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem',n_jobs=15, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(sum_w_x))) + sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem', n_jobs=n_threads, verbose = True)(delayed(numpy.dot)(w_k,x_k[:ts_size(x_k)]) for (w_k, x_k) in zip(list_w_k, X)) + sum_w_x = Parallel(backend='threading',prefer='threads',require='sharedmem',n_jobs=n_threads, verbose = True)(delayed(numpy.sum)(x) for x in zip(*numpy.array(sum_w_x))) sum_w_x = numpy.array(sum_w_x).reshape((len(sum_w_x), 1)) inverse_diag_sum_v_k = numpy.reciprocal(diag_sum_v_k) inverse_diag_sum_v_k = numpy.diag(inverse_diag_sum_v_k) @@ -422,7 +421,7 @@ def _subgradient_update_barycenter(X, list_diag_v_k, list_w_k, weights_sum, def dtw_barycenter_averaging(X, barycenter_size=None, init_barycenter=None, - max_iter=30, tol=1e-5, weights=None, + max_iter=30, tol=1e-5, weights=None, n_threads=15, metric_params=None, verbose=False, n_init=1): """DTW Barycenter Averaging (DBA) method estimated through @@ -530,6 +529,7 @@ def dtw_barycenter_averaging(X, barycenter_size=None, init_barycenter=None, def dtw_barycenter_averaging_one_init(X, barycenter_size=None, init_barycenter=None, max_iter=30, tol=1e-5, weights=None, + n_threads=15, metric_params=None, verbose=False): """DTW Barycenter Averaging (DBA) method estimated through @@ -591,12 +591,12 @@ def dtw_barycenter_averaging_one_init(X, barycenter_size=None, barycenter = init_barycenter cost_prev, cost = numpy.inf, numpy.inf for it in range(max_iter): - list_p_k, cost = _mm_assignment(X, barycenter, weights, metric_params) + list_p_k, cost = _mm_assignment(X, barycenter, weights, metric_params, n_threads) diag_sum_v_k, list_w_k = _mm_valence_warping(list_p_k, barycenter_size, - weights) + weights, n_threads) if verbose: print("[DBA] epoch %d, cost: %.3f" % (it + 1, cost)) - barycenter = _mm_update_barycenter(X, diag_sum_v_k, list_w_k) + barycenter = _mm_update_barycenter(X, diag_sum_v_k, list_w_k, n_threads) if abs(cost_prev - cost) < tol: break elif cost_prev < cost: From 6e2ce6b1515683331e376f48f57ba62e5b41ce6d Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Tue, 5 Jan 2021 12:09:05 +0100 Subject: [PATCH 3/3] Merge branch 'master' into parallel_time_series_k_means # Conflicts: # tslearn/barycenters/dba.py # tslearn/clustering/kmeans.py # tslearn/metrics/dtw_variants.py Co-authored-by: enricofallacara --- tslearn/metrics/utils.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/tslearn/metrics/utils.py b/tslearn/metrics/utils.py index ece61c435..630d68be6 100644 --- a/tslearn/metrics/utils.py +++ b/tslearn/metrics/utils.py @@ -1,5 +1,5 @@ import numpy -from joblib import Parallel, delayed +from joblib import Parallel, delayed, parallel_backend from tslearn.utils import to_time_series_dataset __author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr' @@ -9,26 +9,21 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, compute_diagonal=True, dtype=numpy.float, *args, **kwargs): """Compute cross-similarity matrix with 
joblib parallelization for a given similarity function. - Parameters ---------- dist_fun : function Similarity function to be used - dataset1 : array-like A dataset of time series - dataset2 : array-like (default: None) Another dataset of time series. If `None`, self-similarity of `dataset1` is returned. - n_jobs : int or None, optional (default=None) The number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learn's `Glossary `__ for more details. - verbose : int, optional (default=0) The verbosity level: if non-zero, progress messages are printed. Above 50, the output is sent to stdout. @@ -36,21 +31,17 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, If it is more than 10, all iterations are reported. `Glossary `__ for more details. - compute_diagonal : bool (default: True) Whether diagonal terms should be computed or assumed to be 0 in the self-similarity case. Used only if `dataset2` is `None`. - *args and **kwargs : Optional additional parameters to be passed to the similarity function. - - Returns ------- cdist : numpy.ndarray Cross-similarity matrix """ # noqa: E501 - dataset1 = to_time_series_dataset(dataset1, dtype=dtype) + #dataset1 = to_time_series_dataset(dataset1, dtype=dtype) if dataset2 is None: # Inspired from code by @GillesVandewiele: @@ -60,7 +51,7 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, k=0 if compute_diagonal else 1, m=len(dataset1)) matrix[indices] = Parallel(n_jobs=n_jobs, - prefer="threads", + prefer="processes", verbose=verbose)( delayed(dist_fun)( dataset1[i], dataset1[j], @@ -74,12 +65,13 @@ def _cdist_generic(dist_fun, dataset1, dataset2, n_jobs, verbose, matrix[indices] = matrix.T[indices] return matrix else: - dataset2 = to_time_series_dataset(dataset2, dtype=dtype) - matrix = Parallel(n_jobs=n_jobs, prefer="threads", verbose=verbose)( - delayed(dist_fun)( - dataset1[i], dataset2[j], - *args, **kwargs + #dataset2 = to_time_series_dataset(dataset2, dtype=dtype) + with parallel_backend('loky'): + matrix = Parallel(backend='loky', n_jobs=n_jobs, prefer="processes", verbose=verbose)( + delayed(dist_fun)( + dataset1[i], dataset2[j], + *args, **kwargs + ) + for i in range(len(dataset1)) for j in range(len(dataset2)) ) - for i in range(len(dataset1)) for j in range(len(dataset2)) - ) return numpy.array(matrix).reshape((len(dataset1), -1))
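To make the threads-versus-processes switch in `_cdist_generic` concrete, here is a minimal sketch (not part of the patch itself) contrasting the two joblib execution modes on pairwise DTW. It uses only the public `tslearn.metrics.dtw` and `joblib` APIs; the stated performance trade-off is an assumption to be benchmarked on real data, not a guarantee.

import numpy
from joblib import Parallel, delayed
from tslearn.metrics import dtw

dataset = numpy.random.randn(8, 40, 1)  # 8 univariate series of length 40
pairs = [(i, j) for i in range(len(dataset)) for j in range(len(dataset))]

# Thread-based execution: workers share memory, so the series are never
# serialized, but concurrency is limited by how much of dtw runs outside
# the GIL (e.g. in compiled sections).
d_threads = Parallel(n_jobs=4, prefer="threads")(
    delayed(dtw)(dataset[i], dataset[j]) for i, j in pairs)

# Process-based execution via loky, as _cdist_generic now prefers: true
# parallelism across cores, at the cost of pickling inputs to each worker.
d_processes = Parallel(n_jobs=4, backend="loky")(
    delayed(dtw)(dataset[i], dataset[j]) for i, j in pairs)

# Both backends compute the same distances; only the scheduling differs.
assert numpy.allclose(d_threads, d_processes)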
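On the barycenter side, a sketch of how the parallelized DBA entry point could be invoked once this branch is applied. The `n_threads` keyword (spelled `n_treads` in the original commits, corrected above) exists only in this patch series and is not part of upstream tslearn.

import numpy
from tslearn.barycenters import dtw_barycenter_averaging

# 20 univariate series of length 50, already shaped (n_ts, sz, d).
X = numpy.random.randn(20, 50, 1)

# n_threads caps the size of the shared-memory threading pools used in
# the assignment and barycenter-update steps of each DBA iteration.
barycenter = dtw_barycenter_averaging(X, max_iter=10, tol=1e-5,
                                      n_threads=4, verbose=True)
print(barycenter.shape)  # (50, 1)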
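Finally, note that both versions of `_cdist_generic` above comment out the `to_time_series_dataset` conversion, so inputs are implicitly assumed to already be `(n_ts, sz, d)` float arrays. The following sketch shows the conversion that would then have to happen on the caller's side; this requirement is inferred from the commented-out lines, not stated explicitly by the patch.

import numpy
from tslearn.utils import to_time_series_dataset
from tslearn.metrics import cdist_dtw

# Ragged Python lists are no longer normalized inside _cdist_generic,
# so convert them up front; shorter series are padded with NaN.
X = to_time_series_dataset([[1, 2, 3], [2, 3, 4, 5]])
print(X.shape)  # (2, 4, 1)

D = cdist_dtw(X)  # X already satisfies the (n_ts, sz, d) assumption
print(D.shape)  # (2, 2)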