From 9368a9cde16b96bf1f2459a4024fdb5fb31111fe Mon Sep 17 00:00:00 2001 From: chrisholder Date: Wed, 19 Nov 2025 04:34:35 +0000 Subject: [PATCH 1/8] added new _cluster_initilisation and utility functions to be used in clustering module --- aeon/clustering/_cluster_initialisation.py | 347 +++++++++++++++++++++ aeon/clustering/_elastic_som.py | 70 ++--- aeon/clustering/_k_means.py | 69 +--- aeon/clustering/_k_medoids.py | 69 ++-- aeon/clustering/_kasba.py | 41 +-- aeon/clustering/tests/test_clarans.py | 45 --- aeon/clustering/tests/test_k_means.py | 52 ++- aeon/clustering/tests/test_k_medoids.py | 48 ++- 8 files changed, 500 insertions(+), 241 deletions(-) create mode 100644 aeon/clustering/_cluster_initialisation.py diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py new file mode 100644 index 0000000000..c93c54aef3 --- /dev/null +++ b/aeon/clustering/_cluster_initialisation.py @@ -0,0 +1,347 @@ +from collections.abc import Callable + +import numpy as np +from numpy.random import RandomState + +from aeon.distances import pairwise_distance + + +def _random_center_initialiser_indexes( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return random_state.choice(X.shape[0], n_clusters, replace=False) + + +def _random_center_initialiser( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return X[ + _random_center_initialiser_indexes( + X=X, n_clusters=n_clusters, random_state=random_state + ) + ] + + +def _first_center_initialiser_indexes( + *, X: np.ndarray, n_clusters: int, random_state: RandomState, **kwargs +) -> np.ndarray: + return np.arange(n_clusters) + + +def _first_center_initialiser( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return X[ + _first_center_initialiser_indexes( + X=X, n_clusters=n_clusters, random_state=random_state + ) + ] + + +def _random_values_center_initialiser( + *, X: np.ndarray, n_clusters: 
int, random_state: RandomState +): + return random_state.rand(n_clusters, X.shape[1]) + + +def _kmeans_plus_plus_center_initialiser_indexes( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int, + return_distance_and_labels: bool = False, + **kwargs, +) -> np.ndarray: + n_samples = X.shape[0] + initial_center_idx = random_state.randint(n_samples) + indexes = [initial_center_idx] + + min_distances = pairwise_distance( + X, + X[[initial_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + labels = np.zeros(n_samples, dtype=int) + + for i in range(1, n_clusters): + d = min_distances.copy() + chosen = np.asarray(indexes, dtype=int) + finite_mask = np.isfinite(d) + if not np.any(finite_mask): + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + labels[closer_points] = i + continue + + min_val = d[finite_mask].min() + w = d - min_val + w[~np.isfinite(w)] = 0.0 + w = np.clip(w, 0.0, None) + w[chosen] = 0.0 + + total = w.sum() + if total <= 0.0: + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + else: + p = w / total + p = np.clip(p, 0.0, None) + p_sum = p.sum() + if p_sum <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = p / p_sum + next_center_idx = random_state.choice(n_samples, p=p) + + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + 
method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + labels[closer_points] = i + + if return_distance_and_labels: + return np.array(indexes), labels, min_distances + else: + return np.array(indexes) + + +def _kmeans_plus_plus_center_initialiser( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int, + return_distance_and_labels: bool = False, + **kwargs, +) -> np.ndarray: + indexes, labels, min_distances = _kmeans_plus_plus_center_initialiser_indexes( + X=X, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + return_distance_and_labels=True, + ) + if return_distance_and_labels: + return X[indexes], labels, min_distances + return X[indexes] + + +def _kmedoids_plus_plus_center_initialiser_indexes( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + **kwargs, +) -> np.ndarray: + """K-medoids++ initialisation that returns indexes. + + This is a simpler variant of kmeans++ that uses minimum distances + directly as probabilities without the sophisticated weighting scheme. 
+ """ + initial_center_idx = random_state.randint(X.shape[0]) + indexes = [initial_center_idx] + + for _ in range(1, n_clusters): + pw_dist = pairwise_distance( + X, X[indexes], method=distance, n_jobs=n_jobs, **distance_params + ) + min_distances = pw_dist.min(axis=1) + probabilities = min_distances / min_distances.sum() + next_center_idx = random_state.choice(X.shape[0], p=probabilities) + indexes.append(next_center_idx) + + return np.array(indexes) + + +def _kmedoids_plus_plus_center_initialiser( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + **kwargs, +) -> np.ndarray: + """K-medoids++ initialisation that returns centers.""" + indexes = _kmedoids_plus_plus_center_initialiser_indexes( + X=X, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + ) + return X[indexes] + + +def resolve_center_initialiser( + init: str | np.ndarray, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + initialisers_dict: dict, + distance: str | Callable | None = None, + distance_params: dict | None = None, + n_jobs: int = 1, + custom_init_handlers: dict | None = None, + use_indexes: bool = False, +) -> Callable | np.ndarray: + """Resolve the center initialiser function or array from init parameter. + + Parameters + ---------- + init : str or np.ndarray + Initialisation method string or array of initial centers/indexes. + X : np.ndarray + Input data for validation. + n_clusters : int + Number of clusters. + random_state : RandomState + Random state for initialisation. + initialisers_dict : dict + Dictionary of available initialisers (CENTER_INITIALISERS or + CENTER_INITIALISER_INDEXES). + distance : str or Callable, optional + Distance method (required for kmeans++/kmedoids++). + distance_params : dict, optional + Distance parameters (required for kmeans++/kmedoids++). 
+ n_jobs : int, default=1 + Number of jobs for parallel processing (used for kmeans++/kmedoids++). + custom_init_handlers : dict, optional + Dictionary of custom initialisation handlers for special cases (e.g., + {"build": handler}). + use_indexes : bool, default=False + If True, expects 1D arrays (indexes). If False, expects multi-dimensional + arrays (centers). + + Returns + ------- + Callable or np.ndarray + Initialisation function or array. + """ + valid_init_methods = ", ".join(sorted(initialisers_dict.keys())) + + if isinstance(init, str): + # Check custom handlers first (e.g., "build" for k-medoids) + if custom_init_handlers and init in custom_init_handlers: + return custom_init_handlers[init] + + if init not in initialisers_dict: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. The following are a list of valid init algorithms " + f"strings: {valid_init_methods}. You can also pass a " + f"np.ndarray of appropriate shape." + ) + + initialiser_func = initialisers_dict[init] + if init in ("kmeans++", "kmedoids++"): + # kmeans++ and kmedoids++ need additional parameters + if distance is None or distance_params is None: + raise ValueError( + f"distance and distance_params are required for {init} " + f"initialisation" + ) + return lambda X: initialiser_func( + X=X, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + ) + else: + # random, first, random_values only need basic parameters + return lambda X: initialiser_func( + X=X, + n_clusters=n_clusters, + random_state=random_state, + ) + else: + if isinstance(init, np.ndarray): + if len(init) != n_clusters: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Expected length {n_clusters}, got {len(init)}." + ) + + if use_indexes: + if init.ndim != 1: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. 
Expected 1D array of shape ({n_clusters},), " + f"got {init.shape}." + ) + return init + else: + if init.ndim == 1: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Expected multi-dimensional array of shape " + f"({n_clusters}, {X.shape[1]}, {X.shape[2]}), got {init.shape}." + ) + if init.shape[1:] != X.shape[1:]: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Expected shape ({n_clusters}, {X.shape[1]}, " + f"{X.shape[2]}), got {init.shape}." + ) + return init.copy() + else: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Expected a string or np.ndarray." + ) + + +CENTER_INITIALISERS = { + "random": _random_center_initialiser, + "first": _first_center_initialiser, + "random_values": _random_values_center_initialiser, + "kmeans++": _kmeans_plus_plus_center_initialiser, + "kmedoids++": _kmedoids_plus_plus_center_initialiser, +} + +CENTER_INITIALISER_INDEXES = { + "random": _random_center_initialiser_indexes, + "first": _first_center_initialiser_indexes, + "kmeans++": _kmeans_plus_plus_center_initialiser_indexes, + "kmedoids++": _kmedoids_plus_plus_center_initialiser_indexes, +} diff --git a/aeon/clustering/_elastic_som.py b/aeon/clustering/_elastic_som.py index 361979a6d8..35c017bc12 100644 --- a/aeon/clustering/_elastic_som.py +++ b/aeon/clustering/_elastic_som.py @@ -9,6 +9,10 @@ from numpy.random import RandomState from sklearn.utils.random import check_random_state +from aeon.clustering._cluster_initialisation import ( + CENTER_INITIALISERS, + resolve_center_initialiser, +) from aeon.clustering.base import BaseClusterer from aeon.distances import get_alignment_path_function, pairwise_distance @@ -257,30 +261,23 @@ def _update_iteration(self, x, weights, decay_rate, num_iterations): def _check_params(self, X): self._random_state = check_random_state(self.random_state) - # random initialization - if isinstance(self.init, str): - if self.init == "random": - self._init = 
self._random_center_initializer - elif self.init == "kmeans++": - self._init = self._kmeans_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - else: - raise ValueError( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first" - ) + + if self.distance_params is None: + self._distance_params = {} else: - if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters: - self._init = self.init.copy() - else: - raise ValueError( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first. You can also pass a" - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) + self._distance_params = self.distance_params + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + initialisers_dict=CENTER_INITIALISERS, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + use_indexes=False, + ) self._neuron_position = np.arange(self.n_clusters) @@ -331,11 +328,6 @@ def _check_params(self, X): else: self._alignment_path_callable = None - if self.distance_params is None: - self._distance_params = {} - else: - self._distance_params = self.distance_params - def _elastic_update(self, x, y, w): best_path, distance = self._alignment_path_callable( x, y, **self._distance_params @@ -362,28 +354,6 @@ def _elastic_update(self, x, y, w): return s3 - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[self._random_state.choice(X.shape[0], self.n_clusters, replace=False)] - - def _kmeans_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - 
X, X[indexes], method=self.distance, **self._distance_params - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - - def _first_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[list(range(self.n_clusters))] - @classmethod def _get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. diff --git a/aeon/clustering/_k_means.py b/aeon/clustering/_k_means.py index 54ce0d7b95..2feb6c54ec 100644 --- a/aeon/clustering/_k_means.py +++ b/aeon/clustering/_k_means.py @@ -9,6 +9,10 @@ from numpy.random import RandomState from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import ( + CENTER_INITIALISERS, + resolve_center_initialiser, +) from aeon.clustering.averaging import ( VALID_BA_DISTANCE_METHODS, elastic_barycenter_average, @@ -299,36 +303,23 @@ def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) self._n_jobs = check_n_jobs(self.n_jobs) - _incorrect_init_str = ( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmeans++, first. 
You can also pass a " - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) - - if isinstance(self.init, str): - if self.init == "random": - self._init = self._random_center_initializer - elif self.init == "kmeans++": - self._init = self._kmeans_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - else: - raise ValueError(_incorrect_init_str) - else: - if ( - isinstance(self.init, np.ndarray) - and len(self.init) == self.n_clusters - and self.init.shape[1:] == X.shape[1:] - ): - self._init = self.init.copy() - else: - raise ValueError(_incorrect_init_str) - + # Set up distance_params before init logic (needed for kmeans++ initializer) if self.distance_params is None: self._distance_params = {} else: self._distance_params = self.distance_params.copy() + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + initialisers_dict=CENTER_INITIALISERS, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=self._n_jobs, + use_indexes=False, + ) if self.average_params is None: self._average_params = {} else: @@ -375,32 +366,6 @@ def _check_params(self, X: np.ndarray) -> None: if isinstance(self.averaging_method, str) and self.averaging_method != "mean": self._average_params["n_jobs"] = self._n_jobs - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[self._random_state.choice(X.shape[0], self.n_clusters, replace=False)] - - def _first_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[list(range(self.n_clusters))] - - def _kmeans_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - X, - X[indexes], - method=self.distance, - n_jobs=self._n_jobs, - **self._distance_params, - ) - min_distances = 
pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - def _handle_empty_cluster( self, X: np.ndarray, diff --git a/aeon/clustering/_k_medoids.py b/aeon/clustering/_k_medoids.py index 1b9d02618c..c431ba9faf 100644 --- a/aeon/clustering/_k_medoids.py +++ b/aeon/clustering/_k_medoids.py @@ -11,6 +11,10 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import ( + CENTER_INITIALISER_INDEXES, + resolve_center_initialiser, +) from aeon.clustering.base import BaseClusterer from aeon.distances import get_distance_function, pairwise_distance @@ -235,7 +239,6 @@ def _compute_new_cluster_centers( return np.array(new_center_indexes) def _compute_distance(self, X: np.ndarray, first_index: int, second_index: int): - # Check cache if np.isfinite(self._distance_cache[first_index, second_index]): return self._distance_cache[first_index, second_index] if np.isfinite(self._distance_cache[second_index, first_index]): @@ -243,7 +246,6 @@ def _compute_distance(self, X: np.ndarray, first_index: int, second_index: int): dist = self._distance_callable( X[first_index], X[second_index], **self._distance_params ) - # Update cache self._distance_cache[first_index, second_index] = dist self._distance_cache[second_index, first_index] = dist return dist @@ -282,7 +284,6 @@ def _pam_fit(self, X: np.ndarray): not_medoid_idxs = np.delete(np.arange(n_cases, dtype=int), medoids_idxs) for i in range(self.max_iter): - # Initialize best cost change and the associated swap couple. old_medoid_idxs = np.copy(medoids_idxs) best_cost_change = self._compute_optimal_swaps( distance_matrix, @@ -293,7 +294,6 @@ def _pam_fit(self, X: np.ndarray): ) inertia = np.inf - # If one of the swap decrease the objective, return that swap. 
if best_cost_change is not None and best_cost_change[2] < 0: first, second, _ = best_cost_change medoids_idxs[medoids_idxs == first] = second @@ -431,32 +431,23 @@ def _assign_clusters( def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) - _incorrect_init_str = ( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first, build. You can also pass a " - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) - - if isinstance(self.init, str): - if self.init == "random": - self._init = self._random_center_initializer - elif self.init == "kmedoids++": - self._init = self._kmedoids_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - elif self.init == "build": - self._init = self._pam_build_center_initializer - else: - raise ValueError(_incorrect_init_str) - else: - if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters: - self._init = self.init - else: - raise ValueError(_incorrect_init_str) - if self.distance_params is not None: self._distance_params = self.distance_params + else: + self._distance_params = {} + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + initialisers_dict=CENTER_INITIALISER_INDEXES, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + custom_init_handlers={"build": self._pam_build_center_initializer}, + use_indexes=True, + ) if self.n_clusters > X.shape[0]: raise ValueError( @@ -481,28 +472,6 @@ def _check_params(self, X: np.ndarray) -> None: stacklevel=1, ) - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return self._random_state.choice(X.shape[0], self.n_clusters, replace=False) - - def _first_center_initializer(self, _) -> np.ndarray: - return 
np.array(list(range(self.n_clusters))) - - def _kmedoids_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - X, X[indexes], method=self.distance, **self._distance_params - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - def _pam_build_center_initializer( self, X: np.ndarray, diff --git a/aeon/clustering/_kasba.py b/aeon/clustering/_kasba.py index 63fdd53115..addbef5038 100644 --- a/aeon/clustering/_kasba.py +++ b/aeon/clustering/_kasba.py @@ -9,6 +9,7 @@ from numpy.random import RandomState from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import _kmeans_plus_plus_center_initialiser from aeon.clustering._k_means import EmptyClusterError from aeon.clustering.averaging import kasba_average from aeon.clustering.base import BaseClusterer @@ -142,8 +143,16 @@ def __init__( def _fit(self, X: np.ndarray, y=None): self._check_params(X) - cluster_centers, distances_to_centers, labels = self._elastic_kmeans_plus_plus( - X, + cluster_centers, labels, distances_to_centers = ( + _kmeans_plus_plus_center_initialiser( + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + return_distance_and_labels=True, + ) ) self.labels_, self.cluster_centers_, self.inertia_, self.n_iter_ = self._kasba( X, @@ -321,34 +330,6 @@ def _handle_empty_cluster( return labels, cluster_centers, distances_to_centers - def _elastic_kmeans_plus_plus( - self, - X, - ): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - min_distances = pairwise_distance( - X, 
X[initial_center_idx], method=self.distance, **self._distance_params - ).flatten() - labels = np.zeros(X.shape[0], dtype=int) - - for i in range(1, self.n_clusters): - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - new_distances = pairwise_distance( - X, X[next_center_idx], method=self.distance, **self._distance_params - ).flatten() - - closer_points = new_distances < min_distances - min_distances[closer_points] = new_distances[closer_points] - labels[closer_points] = i - - centers = X[indexes] - return centers, min_distances, labels - def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) diff --git a/aeon/clustering/tests/test_clarans.py b/aeon/clustering/tests/test_clarans.py index a1da285cf3..d6968d6ccd 100644 --- a/aeon/clustering/tests/test_clarans.py +++ b/aeon/clustering/tests/test_clarans.py @@ -2,12 +2,9 @@ import numpy as np from sklearn import metrics -from sklearn.utils import check_random_state from aeon.clustering._clarans import TimeSeriesCLARANS -from aeon.clustering.tests.test_k_medoids import check_value_in_every_cluster from aeon.datasets import load_basic_motions, load_gunpoint -from aeon.distances import euclidean_distance def test_clarans_uni(): @@ -95,45 +92,3 @@ def test_clara_multi(): assert isinstance(clarans.cluster_centers_, np.ndarray) for val in proba: assert np.count_nonzero(val == 1.0) == 1 - - -def test_medoids_init(): - """Test init algorithms.""" - X_train, _ = load_gunpoint(split="train") - X_train = X_train[:10] - - num_clusters = 8 - kmedoids = TimeSeriesCLARANS( - random_state=1, - n_init=1, - init="first", - distance="euclidean", - n_clusters=num_clusters, - ) - - kmedoids._random_state = check_random_state(kmedoids.random_state) - kmedoids._distance_cache = np.full((len(X_train), len(X_train)), np.inf) - kmedoids._distance_callable = euclidean_distance - 
first_medoids_result = kmedoids._first_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, first_medoids_result) - random_medoids_result = kmedoids._random_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, random_medoids_result) - kmedoids_plus_plus_medoids_result = kmedoids._kmedoids_plus_plus_center_initializer( - X_train - ) - check_value_in_every_cluster(num_clusters, kmedoids_plus_plus_medoids_result) - kmedoids_build_result = kmedoids._pam_build_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, kmedoids_build_result) - - # Test setting manual init centres - num_clusters = 8 - custom_init_centres = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - kmedoids = TimeSeriesCLARANS( - random_state=1, - n_init=1, - init=custom_init_centres, - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids.fit(X_train) - assert np.array_equal(kmedoids.cluster_centers_, X_train[custom_init_centres]) diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index ace08a5530..3e0e902695 100644 --- a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -7,6 +7,7 @@ from sklearn import metrics from aeon.clustering import TimeSeriesKMeans +from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS from aeon.datasets import load_basic_motions from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -169,18 +170,18 @@ def test_k_mean_distances(distance): "random_state": 1, "n_init": 1, "n_clusters": 3, - "init": "kmeans++", + "init": "random", "distance": dist, "distance_params": {key: params[key]}, } # Univariate test with_param_kmeans = _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, n_channels=1, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=1, n_timepoints=10 ) # Multivariate test _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, 
n_channels=3, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=3, n_timepoints=10 ) if dist in ELASTIC_DISTANCES: @@ -194,7 +195,7 @@ def test_k_mean_distances(distance): continue default_param_kmeans = _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, n_channels=1, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=1, n_timepoints=10 ) # Test parameters passed through kmeans @@ -408,6 +409,49 @@ def test_empty_cluster(): kmeans.fit(np.array([first, first, first, first, first])) +def test_center_initialisers(): + """Test that CENTER_INITIALISERS work correctly.""" + from numpy.random import RandomState + + X_train = make_example_3d_numpy( + n_cases=20, n_channels=1, n_timepoints=10, random_state=1, return_y=False + ) + n_clusters = 3 + random_state = RandomState(1) + + # Test all available initializers + for init_name, initialiser_func in CENTER_INITIALISERS.items(): + if init_name == "kmeans++" or init_name == "kmedoids++": + # kmeans++ and kmedoids++ needs additional parameters + centers = initialiser_func( + X=X_train, + n_clusters=n_clusters, + random_state=random_state, + distance="euclidean", + distance_params={}, + n_jobs=1, + ) + else: + # Other initializers only need basic parameters + centers = initialiser_func( + X=X_train, + n_clusters=n_clusters, + random_state=random_state, + ) + + # Verify output shape + # random_values returns (n_clusters, n_channels) - it generates random values + # not based on the input data structure + if init_name == "random_values": + assert centers.shape == (n_clusters, X_train.shape[1]) + else: + assert centers.shape == (n_clusters, X_train.shape[1], X_train.shape[2]) + # Verify no duplicate centers + for i in range(n_clusters): + for j in range(i + 1, n_clusters): + assert not np.array_equal(centers[i], centers[j]) + + def test_invalid_params(): """Test invalid parameters for k-mean.""" uni_data = make_example_3d_numpy( diff --git a/aeon/clustering/tests/test_k_medoids.py 
b/aeon/clustering/tests/test_k_medoids.py index 0fea3ead19..789fd43048 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -2,11 +2,9 @@ import numpy as np from sklearn import metrics -from sklearn.utils import check_random_state from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.datasets import load_basic_motions, load_gunpoint -from aeon.distances import euclidean_distance def test_kmedoids_uni(): @@ -165,6 +163,7 @@ def test_medoids_init(): X_train = X_train[:10] num_clusters = 8 + # Test first initializer kmedoids = TimeSeriesKMedoids( random_state=1, n_init=1, @@ -173,18 +172,47 @@ def test_medoids_init(): distance="euclidean", n_clusters=num_clusters, ) - kmedoids._random_state = check_random_state(kmedoids.random_state) - kmedoids._distance_cache = np.full((len(X_train), len(X_train)), np.inf) - kmedoids._distance_callable = euclidean_distance - first_medoids_result = kmedoids._first_center_initializer(X_train) + kmedoids._check_params(X_train) + first_medoids_result = kmedoids._init(X_train) check_value_in_every_cluster(num_clusters, first_medoids_result) - random_medoids_result = kmedoids._random_center_initializer(X_train) + + # Test random initializer + kmedoids = TimeSeriesKMedoids( + random_state=1, + n_init=1, + max_iter=5, + init="random", + distance="euclidean", + n_clusters=num_clusters, + ) + kmedoids._check_params(X_train) + random_medoids_result = kmedoids._init(X_train) check_value_in_every_cluster(num_clusters, random_medoids_result) - kmedoids_plus_plus_medoids_result = kmedoids._kmedoids_plus_plus_center_initializer( - X_train + + # Test kmedoids++ initializer + kmedoids = TimeSeriesKMedoids( + random_state=1, + n_init=1, + max_iter=5, + init="kmedoids++", + distance="euclidean", + n_clusters=num_clusters, ) + kmedoids._check_params(X_train) + kmedoids_plus_plus_medoids_result = kmedoids._init(X_train) check_value_in_every_cluster(num_clusters, 
kmedoids_plus_plus_medoids_result) - kmedoids_build_result = kmedoids._pam_build_center_initializer(X_train) + + # Test build initializer + kmedoids = TimeSeriesKMedoids( + random_state=1, + n_init=1, + max_iter=5, + init="build", + distance="euclidean", + n_clusters=num_clusters, + ) + kmedoids._check_params(X_train) + kmedoids_build_result = kmedoids._init(X_train) check_value_in_every_cluster(num_clusters, kmedoids_build_result) # Test setting manual init centres From 0c8459cc4100e223bed60999fcd771da8c9b694a Mon Sep 17 00:00:00 2001 From: chrisholder Date: Wed, 19 Nov 2025 18:51:52 +0000 Subject: [PATCH 2/8] added tests for init --- aeon/clustering/_cluster_initialisation.py | 6 +- .../tests/test_cluster_initialisation.py | 111 ++++++++++++++++++ 2 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 aeon/clustering/tests/test_cluster_initialisation.py diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index c93c54aef3..0c38655bd3 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -41,7 +41,7 @@ def _first_center_initialiser( def _random_values_center_initialiser( *, X: np.ndarray, n_clusters: int, random_state: RandomState ): - return random_state.rand(n_clusters, X.shape[1]) + return random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) def _kmeans_plus_plus_center_initialiser_indexes( @@ -51,7 +51,7 @@ def _kmeans_plus_plus_center_initialiser_indexes( random_state: RandomState, distance: str | Callable, distance_params: dict, - n_jobs: int, + n_jobs: int = 1, return_distance_and_labels: bool = False, **kwargs, ) -> np.ndarray: @@ -141,7 +141,7 @@ def _kmeans_plus_plus_center_initialiser( random_state: RandomState, distance: str | Callable, distance_params: dict, - n_jobs: int, + n_jobs: int = 1, return_distance_and_labels: bool = False, **kwargs, ) -> np.ndarray: diff --git a/aeon/clustering/tests/test_cluster_initialisation.py 
b/aeon/clustering/tests/test_cluster_initialisation.py new file mode 100644 index 0000000000..d69fe9e923 --- /dev/null +++ b/aeon/clustering/tests/test_cluster_initialisation.py @@ -0,0 +1,111 @@ +"""Tests for cluster initialisation functions.""" + +from collections.abc import Callable + +import numpy as np +import pytest +from numpy.random import RandomState + +from aeon.clustering._cluster_initialisation import ( + CENTER_INITIALISER_INDEXES, + CENTER_INITIALISERS, +) +from aeon.distances._distance import ELASTIC_DISTANCES, POINTWISE_DISTANCES +from aeon.testing.data_generation import make_example_3d_numpy + +NON_RANDOM_INIT = ["first"] + + +def _run_initialisation_test( + key: str, + init_func: Callable, + init_func_indexes: Callable = None, + init_func_params=None, +): + if init_func_params is None: + init_func_params = {} + + X = make_example_3d_numpy(10, 1, 10, random_state=1, return_y=False) + n_clusters = 3 + init_func_params = { + "X": X, + "n_clusters": n_clusters, + **init_func_params, + } + + values = init_func(**init_func_params, random_state=RandomState(1)) + + assert len(values) == n_clusters + assert values.shape[1:] == X.shape[1:] + + assert np.allclose( + values, init_func(**init_func_params, random_state=RandomState(1)) + ) + + if key not in NON_RANDOM_INIT: + diff_random_state_values = init_func( + **init_func_params, random_state=RandomState(2) + ) + assert not np.allclose(values, diff_random_state_values) + + if init_func_indexes: + indexes = init_func_indexes(**init_func_params, random_state=RandomState(1)) + value_from_indexes = X[indexes] + assert np.allclose(values, value_from_indexes) + + +@pytest.mark.parametrize("init_key", CENTER_INITIALISERS.keys()) +def test_center_initialisers(init_key): + """Test all center initialisers.""" + params = {} + if init_key == "kmeans++" or init_key == "kmedoids++": + params["distance"] = "euclidean" + params["distance_params"] = {} + + _run_initialisation_test( + key=init_key, + 
init_func=CENTER_INITIALISERS[init_key], + init_func_indexes=CENTER_INITIALISER_INDEXES.get(init_key, None), + init_func_params=params, + ) + + +@pytest.mark.parametrize("init_key", ["kmeans++", "kmedoids++"]) +@pytest.mark.parametrize("dist", POINTWISE_DISTANCES + ELASTIC_DISTANCES) +def test_distance_center_initialisers(init_key, dist): + """Test all center initialisers with distance.""" + params = { + "distance": dist, + "distance_params": {}, + } + _run_initialisation_test( + key=init_key, + init_func=CENTER_INITIALISERS[init_key], + init_func_indexes=CENTER_INITIALISER_INDEXES.get(init_key, None), + init_func_params=params, + ) + + +@pytest.mark.parametrize("init_key", ["kmeans++", "kmedoids++"]) +def test_distance_center_initialisers_params(init_key): + """Test all center initialisers with distance.""" + n_clusters = 3 + X = make_example_3d_numpy(50, 1, 10, random_state=1, return_y=False) + + init_func_no_window = CENTER_INITIALISERS[init_key]( + X=X, + n_clusters=n_clusters, + distance_params={}, + distance="soft_dtw", + random_state=RandomState(1), + ) + + init_func_window = CENTER_INITIALISERS[init_key]( + X=X, + n_clusters=n_clusters, + distance_params={"gamma": 0.00001}, + distance="soft_dtw", + random_state=RandomState(1), + ) + + assert not np.array_equal(init_func_no_window, init_func_window) From c0643c644bde1c0afc9ccefd6dd5662fa274082c Mon Sep 17 00:00:00 2001 From: chrisholder Date: Wed, 19 Nov 2025 22:54:25 +0000 Subject: [PATCH 3/8] expanded tests --- aeon/clustering/_cluster_initialisation.py | 48 ++++++++++++++-- aeon/clustering/tests/test_elastic_som.py | 51 +++++++++-------- aeon/clustering/tests/test_k_means.py | 8 +-- aeon/clustering/tests/test_k_medoids.py | 64 ++++++++-------------- 4 files changed, 95 insertions(+), 76 deletions(-) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index 0c38655bd3..78995a0073 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ 
b/aeon/clustering/_cluster_initialisation.py @@ -39,11 +39,33 @@ def _first_center_initialiser( def _random_values_center_initialiser( - *, X: np.ndarray, n_clusters: int, random_state: RandomState + *, X: np.ndarray, n_clusters: int, random_state: RandomState, **kwargs ): return random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) +def _random_values_center_initialiser_indexes( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + **kwargs, +) -> np.ndarray: + """Random values initialisation that returns indexes. + + This generates random values and then finds the nearest time series in the + dataset to use as centroids. + """ + centers = random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) + pw_dist = pairwise_distance( + X, centers, method=distance, n_jobs=n_jobs, **distance_params + ) + return pw_dist.argmin(axis=0) + + def _kmeans_plus_plus_center_initialiser_indexes( *, X: np.ndarray, @@ -272,7 +294,7 @@ def resolve_center_initialiser( ) initialiser_func = initialisers_dict[init] - if init in ("kmeans++", "kmedoids++"): + if init in ("kmeans++", "kmedoids++", "random_values"): # kmeans++ and kmedoids++ need additional parameters if distance is None or distance_params is None: raise ValueError( @@ -303,13 +325,26 @@ def resolve_center_initialiser( ) if use_indexes: - if init.ndim != 1: + if init.ndim == 1: + return init + elif init.ndim == 3: + # If values provided but indexes needed, snap to nearest + if distance is None or distance_params is None: + raise ValueError( + "distance and distance_params are required when passing " + "values to an estimator that expects indexes." + ) + pw_dist = pairwise_distance( + X, init, method=distance, n_jobs=n_jobs, **distance_params + ) + return pw_dist.argmin(axis=0) + else: raise ValueError( f"The value provided for init: {init} is " - f"invalid. Expected 1D array of shape ({n_clusters},), " - f"got {init.shape}." + f"invalid. 
Expected 1D array of shape ({n_clusters},) or " + f"3D array of shape ({n_clusters}, {X.shape[1]}, " + f"{X.shape[2]}), got {init.shape}." ) - return init else: if init.ndim == 1: raise ValueError( @@ -342,6 +377,7 @@ def resolve_center_initialiser( CENTER_INITIALISER_INDEXES = { "random": _random_center_initialiser_indexes, "first": _first_center_initialiser_indexes, + "random_values": _random_values_center_initialiser_indexes, "kmeans++": _kmeans_plus_plus_center_initialiser_indexes, "kmedoids++": _kmedoids_plus_plus_center_initialiser_indexes, } diff --git a/aeon/clustering/tests/test_elastic_som.py b/aeon/clustering/tests/test_elastic_som.py index 5d5ef47630..cba1b055a5 100644 --- a/aeon/clustering/tests/test_elastic_som.py +++ b/aeon/clustering/tests/test_elastic_som.py @@ -4,6 +4,7 @@ import pytest from aeon.clustering import ElasticSOM +from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS from aeon.distances import dtw_distance, msm_alignment_path from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -37,26 +38,29 @@ def test_elastic_som_multivariate(): assert preds.shape == (10,) -def test_elastic_som_init(): +@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) +def test_elastic_som_init(init): """Test ElasticSOM with a custom initialization.""" X = make_example_3d_numpy( n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 ) - labels = [] - for init in ["random", "kmeans++", "first"]: - clst = ElasticSOM(n_clusters=3, init=init, random_state=1, num_iterations=10) - clst.fit(X) - assert clst.labels_.shape == (10,) - assert clst.cluster_centers_.shape == (3, 5, 20) - labels.append(clst.labels_) + if init == "ndarray": + init = X[:3] - preds = clst.predict(X) - assert preds.shape == (10,) + clst = ElasticSOM(n_clusters=3, init=init, random_state=1, num_iterations=10) + clst.fit(X) + assert clst.labels_.shape == (10,) + assert 
clst.cluster_centers_.shape == (3, 5, 20) - # Check that the labels are different - assert not np.array_equal(labels[0], labels[1]) - assert not np.array_equal(labels[0], labels[2]) - assert not np.array_equal(labels[1], labels[2]) + preds = clst.predict(X) + assert preds.shape == (10,) + + +def test_elastic_som_init_invalid(): + """Test ElasticSOM with invalid initialization.""" + X = make_example_3d_numpy( + n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 + ) # Test invalid init with pytest.raises(ValueError): clst = ElasticSOM( @@ -64,23 +68,26 @@ def test_elastic_som_init(): ) clst.fit(X) + # Test more ndarrays than clusters + with pytest.raises(ValueError): + clst = ElasticSOM(n_clusters=3, init=X[:4], random_state=1, num_iterations=10) + clst.fit(X) + + +def test_elastic_som_custom_init(): + """Test ElasticSOM with custom initialization.""" + X = make_example_3d_numpy( + n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 + ) # Test custom ndarray init clst = ElasticSOM(n_clusters=3, init=X[:3], random_state=1, num_iterations=10) clst.fit(X) assert clst.labels_.shape == (10,) assert clst.cluster_centers_.shape == (3, 5, 20) - # Last labels is for "first" init - assert np.array_equal(clst.labels_, labels[-1]) - preds = clst.predict(X) assert preds.shape == (10,) - # Test more ndarrays than clusters - with pytest.raises(ValueError): - clst = ElasticSOM(n_clusters=3, init=X[:4], random_state=1, num_iterations=10) - clst.fit(X) - def test_elastic_som_decay_function(): """Test ElasticSOM with a custom decay function.""" diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index 3e0e902695..ce2d240e05 100644 --- a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -249,15 +249,11 @@ def test_k_mean_ba(distance, averaging_method): @pytest.mark.parametrize("distance", TEST_DISTANCE_WITH_CUSTOM_DISTANCE) -@pytest.mark.parametrize("init", ["random", 
"kmeans++", "first", "ndarray"]) +@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) def test_k_mean_init(distance, init): """Test implementation of Kmeans.""" distance, params = distance - # Only kmeans++ needs test with different distances - if init != "kmeans++" and distance != "euclidean": - return - n_cases = 10 n_timepoints = 10 n_clusters = 4 @@ -443,7 +439,7 @@ def test_center_initialisers(): # random_values returns (n_clusters, n_channels) - it generates random values # not based on the input data structure if init_name == "random_values": - assert centers.shape == (n_clusters, X_train.shape[1]) + assert centers.shape == (n_clusters, X_train.shape[1], X_train.shape[2]) else: assert centers.shape == (n_clusters, X_train.shape[1], X_train.shape[2]) # Verify no duplicate centers diff --git a/aeon/clustering/tests/test_k_medoids.py b/aeon/clustering/tests/test_k_medoids.py index 789fd43048..28decce10e 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -1,8 +1,12 @@ """Tests for time series k-medoids.""" +from collections.abc import Callable + import numpy as np +import pytest from sklearn import metrics +from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.datasets import load_basic_motions, load_gunpoint @@ -157,64 +161,40 @@ def check_value_in_every_cluster(num_clusters, initial_medoids): assert original_length == len(set(initial_medoids)) -def test_medoids_init(): +@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) +def test_medoids_init(init): """Test implementation of Kmedoids.""" X_train, _ = load_gunpoint(split="train") X_train = X_train[:10] - num_clusters = 8 - # Test first initializer - kmedoids = TimeSeriesKMedoids( - random_state=1, - n_init=1, - max_iter=5, - init="first", - distance="euclidean", - n_clusters=num_clusters, - ) - 
kmedoids._check_params(X_train) - first_medoids_result = kmedoids._init(X_train) - check_value_in_every_cluster(num_clusters, first_medoids_result) + num_clusters = 3 - # Test random initializer - kmedoids = TimeSeriesKMedoids( - random_state=1, - n_init=1, - max_iter=5, - init="random", - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids._check_params(X_train) - random_medoids_result = kmedoids._init(X_train) - check_value_in_every_cluster(num_clusters, random_medoids_result) + if init == "ndarray": + # Generate random values (not indexes) to test snapping + rng = np.random.RandomState(1) + init = rng.rand(num_clusters, X_train.shape[1], X_train.shape[2]) - # Test kmedoids++ initializer + # Test initializer kmedoids = TimeSeriesKMedoids( random_state=1, n_init=1, max_iter=5, - init="kmedoids++", + init=init, distance="euclidean", n_clusters=num_clusters, ) kmedoids._check_params(X_train) - kmedoids_plus_plus_medoids_result = kmedoids._init(X_train) - check_value_in_every_cluster(num_clusters, kmedoids_plus_plus_medoids_result) + if isinstance(kmedoids._init, Callable): + medoids_result = kmedoids._init(X_train) + else: + medoids_result = kmedoids._init + check_value_in_every_cluster(num_clusters, medoids_result) - # Test build initializer - kmedoids = TimeSeriesKMedoids( - random_state=1, - n_init=1, - max_iter=5, - init="build", - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids._check_params(X_train) - kmedoids_build_result = kmedoids._init(X_train) - check_value_in_every_cluster(num_clusters, kmedoids_build_result) +def test_medoids_custom_init(): + """Test implementation of Kmedoids with custom init.""" + X_train, _ = load_gunpoint(split="train") + X_train = X_train[:10] # Test setting manual init centres num_clusters = 8 custom_init_centres = np.array([1, 2, 3, 4, 5, 6, 7, 8]) From ac900127cfb36c38f814e242051d9f94fb7c0a33 Mon Sep 17 00:00:00 2001 From: chrisholder Date: Thu, 20 Nov 2025 01:42:52 +0000 Subject: [PATCH 4/8] fixed 
medoids to use indexes --- aeon/clustering/_cluster_initialisation.py | 52 ++++++--------------- aeon/clustering/tests/test_elastic_som.py | 15 ------ aeon/clustering/tests/test_k_means.py | 10 ++-- aeon/clustering/tests/test_k_medoids.py | 54 ++++++++++++---------- 4 files changed, 49 insertions(+), 82 deletions(-) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index 78995a0073..693749531a 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -44,28 +44,6 @@ def _random_values_center_initialiser( return random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) -def _random_values_center_initialiser_indexes( - *, - X: np.ndarray, - n_clusters: int, - random_state: RandomState, - distance: str | Callable, - distance_params: dict, - n_jobs: int = 1, - **kwargs, -) -> np.ndarray: - """Random values initialisation that returns indexes. - - This generates random values and then finds the nearest time series in the - dataset to use as centroids. - """ - centers = random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) - pw_dist = pairwise_distance( - X, centers, method=distance, n_jobs=n_jobs, **distance_params - ) - return pw_dist.argmin(axis=0) - - def _kmeans_plus_plus_center_initialiser_indexes( *, X: np.ndarray, @@ -325,26 +303,23 @@ def resolve_center_initialiser( ) if use_indexes: - if init.ndim == 1: - return init - elif init.ndim == 3: - # If values provided but indexes needed, snap to nearest - if distance is None or distance_params is None: - raise ValueError( - "distance and distance_params are required when passing " - "values to an estimator that expects indexes." - ) - pw_dist = pairwise_distance( - X, init, method=distance, n_jobs=n_jobs, **distance_params + if init.ndim != 1: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Expected 1D array of shape ({n_clusters},), " + f"got {init.shape}." 
) - return pw_dist.argmin(axis=0) - else: + if not np.issubdtype(init.dtype, np.integer): raise ValueError( f"The value provided for init: {init} is " - f"invalid. Expected 1D array of shape ({n_clusters},) or " - f"3D array of shape ({n_clusters}, {X.shape[1]}, " - f"{X.shape[2]}), got {init.shape}." + f"invalid. Expected an array of integers, got {init.dtype}." + ) + if init.min() < 0 or init.max() >= X.shape[0]: + raise ValueError( + f"The value provided for init: {init} is " + f"invalid. Values must be in the range [0, {X.shape[0]})." ) + return init else: if init.ndim == 1: raise ValueError( @@ -377,7 +352,6 @@ def resolve_center_initialiser( CENTER_INITIALISER_INDEXES = { "random": _random_center_initialiser_indexes, "first": _first_center_initialiser_indexes, - "random_values": _random_values_center_initialiser_indexes, "kmeans++": _kmeans_plus_plus_center_initialiser_indexes, "kmedoids++": _kmedoids_plus_plus_center_initialiser_indexes, } diff --git a/aeon/clustering/tests/test_elastic_som.py b/aeon/clustering/tests/test_elastic_som.py index cba1b055a5..0c21ee74ef 100644 --- a/aeon/clustering/tests/test_elastic_som.py +++ b/aeon/clustering/tests/test_elastic_som.py @@ -74,21 +74,6 @@ def test_elastic_som_init_invalid(): clst.fit(X) -def test_elastic_som_custom_init(): - """Test ElasticSOM with custom initialization.""" - X = make_example_3d_numpy( - n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 - ) - # Test custom ndarray init - clst = ElasticSOM(n_clusters=3, init=X[:3], random_state=1, num_iterations=10) - clst.fit(X) - assert clst.labels_.shape == (10,) - assert clst.cluster_centers_.shape == (3, 5, 20) - - preds = clst.predict(X) - assert preds.shape == (10,) - - def test_elastic_som_decay_function(): """Test ElasticSOM with a custom decay function.""" X = make_example_3d_numpy( diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index ce2d240e05..08ba7bc779 100644 --- 
a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -198,10 +198,12 @@ def test_k_mean_distances(distance): kmeans_params=curr_params, n_cases=80, n_channels=1, n_timepoints=10 ) - # Test parameters passed through kmeans - assert not np.array_equal( - with_param_kmeans.cluster_centers_, default_param_kmeans.cluster_centers_ - ) + if not isinstance(dist, Callable): + # Test parameters passed through kmeans + assert not np.array_equal( + with_param_kmeans.cluster_centers_, + default_param_kmeans.cluster_centers_, + ) @pytest.mark.parametrize("distance", TEST_DISTANCES_WITH_FULL_ALIGNMENT_PATH) diff --git a/aeon/clustering/tests/test_k_medoids.py b/aeon/clustering/tests/test_k_medoids.py index 28decce10e..8774b8e8c8 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -6,7 +6,7 @@ import pytest from sklearn import metrics -from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS +from aeon.clustering._cluster_initialisation import CENTER_INITIALISER_INDEXES from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.datasets import load_basic_motions, load_gunpoint @@ -161,7 +161,7 @@ def check_value_in_every_cluster(num_clusters, initial_medoids): assert original_length == len(set(initial_medoids)) -@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) +@pytest.mark.parametrize("init", list(CENTER_INITIALISER_INDEXES.keys()) + ["indexes"]) def test_medoids_init(init): """Test implementation of Kmedoids.""" X_train, _ = load_gunpoint(split="train") @@ -169,10 +169,10 @@ def test_medoids_init(init): num_clusters = 3 - if init == "ndarray": - # Generate random values (not indexes) to test snapping + if init == "indexes": + # Generate random indexes rng = np.random.RandomState(1) - init = rng.rand(num_clusters, X_train.shape[1], X_train.shape[2]) + init = rng.choice(X_train.shape[0], num_clusters, replace=False) # Test initializer kmedoids 
= TimeSeriesKMedoids( @@ -191,25 +191,6 @@ def test_medoids_init(init): check_value_in_every_cluster(num_clusters, medoids_result) -def test_medoids_custom_init(): - """Test implementation of Kmedoids with custom init.""" - X_train, _ = load_gunpoint(split="train") - X_train = X_train[:10] - # Test setting manual init centres - num_clusters = 8 - custom_init_centres = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - kmedoids = TimeSeriesKMedoids( - random_state=1, - n_init=1, - max_iter=5, - init=custom_init_centres, - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids.fit(X_train) - assert np.array_equal(kmedoids.cluster_centers_, X_train[custom_init_centres]) - - def _get_model_centres(data, distance, method="pam", distance_params=None): """Get the centres of a model.""" model = TimeSeriesKMedoids( @@ -238,3 +219,28 @@ def test_custom_distance_params(): data, distance="msm", distance_params={"window": 0.01} ) assert not np.array_equal(default_dist, custom_params_dist) + + +def test_medoids_init_invalid(): + """Test implementation of Kmedoids with invalid init.""" + X_train, _ = load_gunpoint(split="train") + X_train = X_train[:10] + num_clusters = 3 + + # Test float array + with pytest.raises(ValueError, match="Expected an array of integers"): + kmedoids = TimeSeriesKMedoids( + n_clusters=num_clusters, + init=np.array([0.5, 1.5, 2.5]), + random_state=1, + ) + kmedoids.fit(X_train) + + # Test out of bounds + with pytest.raises(ValueError, match="Values must be in the range"): + kmedoids = TimeSeriesKMedoids( + n_clusters=num_clusters, + init=np.array([0, 1, 100]), + random_state=1, + ) + kmedoids.fit(X_train) From c5405db7633c9ea4d44a096bed83a8a75bff40c8 Mon Sep 17 00:00:00 2001 From: chrisholder Date: Thu, 20 Nov 2025 02:07:48 +0000 Subject: [PATCH 5/8] fixed serialisation --- aeon/clustering/_clarans.py | 2 +- aeon/clustering/_cluster_initialisation.py | 154 +++++++++------------ aeon/clustering/_elastic_som.py | 2 +- aeon/clustering/_k_means.py | 2 +- 
aeon/clustering/_k_medoids.py | 4 +- aeon/clustering/tests/test_k_means.py | 4 +- aeon/clustering/tests/test_k_medoids.py | 2 +- 7 files changed, 76 insertions(+), 94 deletions(-) diff --git a/aeon/clustering/_clarans.py b/aeon/clustering/_clarans.py index 2ee497366b..6d2d8298f6 100644 --- a/aeon/clustering/_clarans.py +++ b/aeon/clustering/_clarans.py @@ -128,7 +128,7 @@ def _fit_one_init(self, X: np.ndarray, max_neighbours: int): j = 0 X_indexes = np.arange(X.shape[0], dtype=int) if isinstance(self._init, Callable): - best_medoids = self._init(X) + best_medoids = self._init(X=X) else: best_medoids = self._init best_non_medoids = np.setdiff1d(X_indexes, best_medoids) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index 693749531a..f8ad39934c 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from functools import partial import numpy as np from numpy.random import RandomState @@ -223,123 +224,104 @@ def resolve_center_initialiser( custom_init_handlers: dict | None = None, use_indexes: bool = False, ) -> Callable | np.ndarray: - """Resolve the center initialiser function or array from init parameter. - - Parameters - ---------- - init : str or np.ndarray - Initialisation method string or array of initial centers/indexes. - X : np.ndarray - Input data for validation. - n_clusters : int - Number of clusters. - random_state : RandomState - Random state for initialisation. - initialisers_dict : dict - Dictionary of available initialisers (CENTER_INITIALISERS or - CENTER_INITIALISER_INDEXES). - distance : str or Callable, optional - Distance method (required for kmeans++/kmedoids++). - distance_params : dict, optional - Distance parameters (required for kmeans++/kmedoids++). - n_jobs : int, default=1 - Number of jobs for parallel processing (used for kmeans++/kmedoids++). 
- custom_init_handlers : dict, optional - Dictionary of custom initialisation handlers for special cases (e.g., - {"build": handler}). - use_indexes : bool, default=False - If True, expects 1D arrays (indexes). If False, expects multi-dimensional - arrays (centers). - - Returns - ------- - Callable or np.ndarray - Initialisation function or array. - """ + """Resolve the center initialiser function or array from init parameter.""" valid_init_methods = ", ".join(sorted(initialisers_dict.keys())) if isinstance(init, str): - # Check custom handlers first (e.g., "build" for k-medoids) + # Custom handlers first (e.g., "build" for k-medoids) if custom_init_handlers and init in custom_init_handlers: + # This is typically a bound method on the estimator (picklable) return custom_init_handlers[init] if init not in initialisers_dict: raise ValueError( - f"The value provided for init: {init} is " - f"invalid. The following are a list of valid init algorithms " + f"The value provided for init: {init} is invalid. " + f"The following are a list of valid init algorithms " f"strings: {valid_init_methods}. You can also pass a " f"np.ndarray of appropriate shape." 
) initialiser_func = initialisers_dict[init] - if init in ("kmeans++", "kmedoids++", "random_values"): - # kmeans++ and kmedoids++ need additional parameters + + # Initialisers that need distance info + if init in ("kmeans++", "kmedoids++"): if distance is None or distance_params is None: raise ValueError( f"distance and distance_params are required for {init} " f"initialisation" ) - return lambda X: initialiser_func( - X=X, + # Return a partial of the top-level function (picklable) + return partial( + initialiser_func, n_clusters=n_clusters, random_state=random_state, distance=distance, distance_params=distance_params, n_jobs=n_jobs, ) - else: - # random, first, random_values only need basic parameters - return lambda X: initialiser_func( - X=X, + + # random_values doesn't need distance, just size + RNG + if init == "random_values": + return partial( + initialiser_func, n_clusters=n_clusters, random_state=random_state, ) - else: - if isinstance(init, np.ndarray): - if len(init) != n_clusters: - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected length {n_clusters}, got {len(init)}." - ) - if use_indexes: - if init.ndim != 1: - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected 1D array of shape ({n_clusters},), " - f"got {init.shape}." - ) - if not np.issubdtype(init.dtype, np.integer): - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected an array of integers, got {init.dtype}." - ) - if init.min() < 0 or init.max() >= X.shape[0]: - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Values must be in the range [0, {X.shape[0]})." - ) - return init - else: - if init.ndim == 1: - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected multi-dimensional array of shape " - f"({n_clusters}, {X.shape[1]}, {X.shape[2]}), got {init.shape}." 
- ) - if init.shape[1:] != X.shape[1:]: - raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected shape ({n_clusters}, {X.shape[1]}, " - f"{X.shape[2]}), got {init.shape}." - ) - return init.copy() - else: + # "random", "first", etc. – basic initialisers + return partial( + initialiser_func, + n_clusters=n_clusters, + random_state=random_state, + ) + + # ---- np.ndarray branch (indexes / centers) ---- + if isinstance(init, np.ndarray): + if len(init) != n_clusters: raise ValueError( - f"The value provided for init: {init} is " - f"invalid. Expected a string or np.ndarray." + f"The value provided for init: {init} is invalid. " + f"Expected length {n_clusters}, got {len(init)}." ) + if use_indexes: + if init.ndim != 1: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected 1D array of shape ({n_clusters},), " + f"got {init.shape}." + ) + if not np.issubdtype(init.dtype, np.integer): + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected an array of integers, got {init.dtype}." + ) + if init.min() < 0 or init.max() >= X.shape[0]: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Values must be in the range [0, {X.shape[0]})." + ) + return init + else: + if init.ndim == 1: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected multi-dimensional array of shape " + f"({n_clusters}, {X.shape[1]}, {X.shape[2]}), " + f"got {init.shape}." + ) + if init.shape[1:] != X.shape[1:]: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected shape ({n_clusters}, {X.shape[1]}, " + f"{X.shape[2]}), got {init.shape}." + ) + return init.copy() + + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected a string or np.ndarray." 
+ ) + CENTER_INITIALISERS = { "random": _random_center_initialiser, diff --git a/aeon/clustering/_elastic_som.py b/aeon/clustering/_elastic_som.py index 35c017bc12..547e14ecb0 100644 --- a/aeon/clustering/_elastic_som.py +++ b/aeon/clustering/_elastic_som.py @@ -207,7 +207,7 @@ def _fit(self, X, y=None): self._check_params(X) if isinstance(self._init, Callable): - weights = self._init(X) + weights = self._init(X=X) else: weights = self._init.copy() diff --git a/aeon/clustering/_k_means.py b/aeon/clustering/_k_means.py index 2feb6c54ec..5b8f954f6d 100644 --- a/aeon/clustering/_k_means.py +++ b/aeon/clustering/_k_means.py @@ -243,7 +243,7 @@ def _fit(self, X: np.ndarray, y=None): def _fit_one_init(self, X: np.ndarray) -> tuple: if isinstance(self._init, Callable): - cluster_centres = self._init(X) + cluster_centres = self._init(X=X) else: cluster_centres = self._init.copy() prev_inertia = np.inf diff --git a/aeon/clustering/_k_medoids.py b/aeon/clustering/_k_medoids.py index c431ba9faf..59face1e1a 100644 --- a/aeon/clustering/_k_medoids.py +++ b/aeon/clustering/_k_medoids.py @@ -273,7 +273,7 @@ def _pam_fit(self, X: np.ndarray): n_cases = X.shape[0] if isinstance(self._init, Callable): - medoids_idxs = self._init(X) + medoids_idxs = self._init(X=X) else: medoids_idxs = self._init not_medoid_idxs = np.arange(n_cases, dtype=int) @@ -393,7 +393,7 @@ def _compute_optimal_swaps( def _alternate_fit(self, X) -> tuple[np.ndarray, np.ndarray, float, int]: cluster_center_indexes = self._init if isinstance(self._init, Callable): - cluster_center_indexes = self._init(X) + cluster_center_indexes = self._init(X=X) old_inertia = np.inf old_indexes = None for i in range(self.max_iter): diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index 08ba7bc779..841431f213 100644 --- a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -288,7 +288,7 @@ def test_k_mean_init(distance, init): 
kmeans._check_params(X_train_uni) if isinstance(kmeans._init, Callable): - uni_init_vals = kmeans._init(X_train_uni) + uni_init_vals = kmeans._init(X=X_train_uni) else: uni_init_vals = kmeans._init @@ -323,7 +323,7 @@ def test_k_mean_init(distance, init): kmeans._check_params(X_train_multi) if isinstance(kmeans._init, Callable): - multi_init_vals = kmeans._init(X_train_multi) + multi_init_vals = kmeans._init(X=X_train_multi) else: multi_init_vals = kmeans._init diff --git a/aeon/clustering/tests/test_k_medoids.py b/aeon/clustering/tests/test_k_medoids.py index 8774b8e8c8..cb770249bf 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -185,7 +185,7 @@ def test_medoids_init(init): ) kmedoids._check_params(X_train) if isinstance(kmedoids._init, Callable): - medoids_result = kmedoids._init(X_train) + medoids_result = kmedoids._init(X=X_train) else: medoids_result = kmedoids._init check_value_in_every_cluster(num_clusters, medoids_result) From e4a5779b2785a22c72fda5e25d2af160c863a4f9 Mon Sep 17 00:00:00 2001 From: chrisholder Date: Thu, 20 Nov 2025 04:06:48 +0000 Subject: [PATCH 6/8] kmedoids++ support negative distances --- aeon/clustering/_cluster_initialisation.py | 70 +++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index f8ad39934c..b24c185173 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -172,21 +172,73 @@ def _kmedoids_plus_plus_center_initialiser_indexes( ) -> np.ndarray: """K-medoids++ initialisation that returns indexes. - This is a simpler variant of kmeans++ that uses minimum distances - directly as probabilities without the sophisticated weighting scheme. + This uses a k-means++-style seeding procedure, but with medoids, + and supports potentially negative distances by shifting the + distance distribution to be non-negative. 
""" - initial_center_idx = random_state.randint(X.shape[0]) + n_samples = X.shape[0] + initial_center_idx = random_state.randint(n_samples) indexes = [initial_center_idx] + # Initial minimum distances to the first medoid + min_distances = pairwise_distance( + X, + X[[initial_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + for _ in range(1, n_clusters): - pw_dist = pairwise_distance( - X, X[indexes], method=distance, n_jobs=n_jobs, **distance_params - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = random_state.choice(X.shape[0], p=probabilities) + d = min_distances.copy() + chosen = np.asarray(indexes, dtype=int) + + finite_mask = np.isfinite(d) + if not np.any(finite_mask): + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + else: + min_val = d[finite_mask].min() + w = d - min_val + + w[~np.isfinite(w)] = 0.0 + + w = np.clip(w, 0.0, None) + + w[chosen] = 0.0 + + total = w.sum() + if total <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = w / total + p = np.clip(p, 0.0, None) + p_sum = p.sum() + if p_sum <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = p / p_sum + next_center_idx = random_state.choice(n_samples, p=p) + indexes.append(next_center_idx) + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + return np.array(indexes) From 819e124fcd10fed7af076691b08e315c5290ee90 Mon Sep 17 00:00:00 2001 From: chrisholder Date: Sat, 22 Nov 2025 03:07:47 +0000 
Subject: [PATCH 7/8] added docstring and updated function params --- aeon/clustering/_cluster_initialisation.py | 69 ++++++++++++++++--- aeon/clustering/_elastic_som.py | 2 - aeon/clustering/_k_means.py | 2 - aeon/clustering/_k_medoids.py | 2 - .../tests/test_cluster_initialisation.py | 18 ++--- aeon/clustering/tests/test_elastic_som.py | 4 +- aeon/clustering/tests/test_k_means.py | 6 +- aeon/clustering/tests/test_k_medoids.py | 4 +- 8 files changed, 74 insertions(+), 33 deletions(-) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index b24c185173..4691621694 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -1,3 +1,13 @@ +"""Initialisation strategies for clustering. + +This file contains the various initialisation algorithms that can be used +with clustering algorithms. + +The functions with "indexes" in their names return the indexes of the +initialised clusters, while the functions with "centers" return the +actual centers. +""" + from collections.abc import Callable from functools import partial @@ -269,20 +279,62 @@ def resolve_center_initialiser( X: np.ndarray, n_clusters: int, random_state: RandomState, - initialisers_dict: dict, distance: str | Callable | None = None, distance_params: dict | None = None, n_jobs: int = 1, custom_init_handlers: dict | None = None, use_indexes: bool = False, ) -> Callable | np.ndarray: - """Resolve the center initialiser function or array from init parameter.""" + """Resolve the center initialiser function or array from init parameter. 
+ + Parameters + ---------- + X : 3D np.ndarray + Input data, any number of channels, equal length series of shape ``( + n_cases, n_channels, n_timepoints)`` + or 2D np.array (univariate, equal length series) of shape + ``(n_cases, n_timepoints)`` + or list of numpy arrays (any number of channels, unequal length series) + of shape ``[n_cases]``, 2D np.array ``(n_channels, n_timepoints_i)``, + where ``n_timepoints_i`` is length of series ``i``. Other types are + allowed and converted into one of the above. + n_clusters : int + The number of clusters to form as well as the number of centroids to generate. + random_state : RandomState + `np.random.RandomState` instance that is passed to the selected initialiser. + distance : str or callable, optional + Distance method to compute similarity between time series. A list of valid + strings for measures can be found in the documentation for + :func:`aeon.distances.get_distance_function`. If a callable is passed it must be + a function that takes two 2d numpy arrays as input and returns a float. + distance_params : dict, default=None + Dictionary containing kwargs for the distance being used. For example if you + wanted to specify a window for DTW you would pass + distance_params={"window": 0.2}. See documentation of aeon.distances for more + details. + n_jobs : int, default=1 + The number of jobs to run in parallel. If -1, then the number of jobs is set + to the number of CPU cores. If 1, then the function is executed in a single + thread. If greater than 1, then the function is executed in parallel. + custom_init_handlers : dict, default=None + A dictionary of custom initialisation functions that can be used to initialise. 
+ use_indexes : bool, default=False + When True, the initialiser that returns indexes is used; when False, + the initialiser that returns the centres is used. + + Returns + ------- + Callable | np.ndarray + If an ndarray is specified as init then the validated ndarray is returned. + If a string is passed then the corresponding function is returned. + """ + initialisers_dict = ( + _CENTRE_INITIALISERS if not use_indexes else _CENTRE_INITIALISER_INDEXES + ) valid_init_methods = ", ".join(sorted(initialisers_dict.keys())) if isinstance(init, str): - # Custom handlers first (e.g., "build" for k-medoids) if custom_init_handlers and init in custom_init_handlers: - # This is typically a bound method on the estimator (picklable) return custom_init_handlers[init] if init not in initialisers_dict: @@ -295,14 +347,12 @@ def resolve_center_initialiser( initialiser_func = initialisers_dict[init] - # Initialisers that need distance info if init in ("kmeans++", "kmedoids++"): if distance is None or distance_params is None: raise ValueError( f"distance and distance_params are required for {init} " f"initialisation" ) - # Return a partial of the top-level function (picklable) return partial( initialiser_func, n_clusters=n_clusters, @@ -312,7 +362,6 @@ def resolve_center_initialiser( n_jobs=n_jobs, ) - # random_values doesn't need distance, just size + RNG if init == "random_values": return partial( initialiser_func, @@ -320,14 +369,12 @@ def resolve_center_initialiser( random_state=random_state, ) - # "random", "first", etc. 
– basic initialisers return partial( initialiser_func, n_clusters=n_clusters, random_state=random_state, ) - # ---- np.ndarray branch (indexes / centers) ---- if isinstance(init, np.ndarray): if len(init) != n_clusters: raise ValueError( @@ -375,7 +422,7 @@ def resolve_center_initialiser( ) -CENTER_INITIALISERS = { +_CENTRE_INITIALISERS = { "random": _random_center_initialiser, "first": _first_center_initialiser, "random_values": _random_values_center_initialiser, @@ -383,7 +430,7 @@ def resolve_center_initialiser( "kmedoids++": _kmedoids_plus_plus_center_initialiser, } -CENTER_INITIALISER_INDEXES = { +_CENTRE_INITIALISER_INDEXES = { "random": _random_center_initialiser_indexes, "first": _first_center_initialiser_indexes, "kmeans++": _kmeans_plus_plus_center_initialiser_indexes, diff --git a/aeon/clustering/_elastic_som.py b/aeon/clustering/_elastic_som.py index 547e14ecb0..40fc87135f 100644 --- a/aeon/clustering/_elastic_som.py +++ b/aeon/clustering/_elastic_som.py @@ -10,7 +10,6 @@ from sklearn.utils.random import check_random_state from aeon.clustering._cluster_initialisation import ( - CENTER_INITIALISERS, resolve_center_initialiser, ) from aeon.clustering.base import BaseClusterer @@ -272,7 +271,6 @@ def _check_params(self, X): X=X, n_clusters=self.n_clusters, random_state=self._random_state, - initialisers_dict=CENTER_INITIALISERS, distance=self.distance, distance_params=self._distance_params, n_jobs=1, diff --git a/aeon/clustering/_k_means.py b/aeon/clustering/_k_means.py index 5b8f954f6d..78b40b55ec 100644 --- a/aeon/clustering/_k_means.py +++ b/aeon/clustering/_k_means.py @@ -10,7 +10,6 @@ from sklearn.utils import check_random_state from aeon.clustering._cluster_initialisation import ( - CENTER_INITIALISERS, resolve_center_initialiser, ) from aeon.clustering.averaging import ( @@ -314,7 +313,6 @@ def _check_params(self, X: np.ndarray) -> None: X=X, n_clusters=self.n_clusters, random_state=self._random_state, - initialisers_dict=CENTER_INITIALISERS, 
distance=self.distance, distance_params=self._distance_params, n_jobs=self._n_jobs, diff --git a/aeon/clustering/_k_medoids.py b/aeon/clustering/_k_medoids.py index 59face1e1a..0825e31cd1 100644 --- a/aeon/clustering/_k_medoids.py +++ b/aeon/clustering/_k_medoids.py @@ -12,7 +12,6 @@ from sklearn.utils import check_random_state from aeon.clustering._cluster_initialisation import ( - CENTER_INITIALISER_INDEXES, resolve_center_initialiser, ) from aeon.clustering.base import BaseClusterer @@ -441,7 +440,6 @@ def _check_params(self, X: np.ndarray) -> None: X=X, n_clusters=self.n_clusters, random_state=self._random_state, - initialisers_dict=CENTER_INITIALISER_INDEXES, distance=self.distance, distance_params=self._distance_params, n_jobs=1, diff --git a/aeon/clustering/tests/test_cluster_initialisation.py b/aeon/clustering/tests/test_cluster_initialisation.py index d69fe9e923..922e36d33d 100644 --- a/aeon/clustering/tests/test_cluster_initialisation.py +++ b/aeon/clustering/tests/test_cluster_initialisation.py @@ -7,8 +7,8 @@ from numpy.random import RandomState from aeon.clustering._cluster_initialisation import ( - CENTER_INITIALISER_INDEXES, - CENTER_INITIALISERS, + _CENTRE_INITIALISER_INDEXES, + _CENTRE_INITIALISERS, ) from aeon.distances._distance import ELASTIC_DISTANCES, POINTWISE_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -54,7 +54,7 @@ def _run_initialisation_test( assert np.allclose(values, value_from_indexes) -@pytest.mark.parametrize("init_key", CENTER_INITIALISERS.keys()) +@pytest.mark.parametrize("init_key", _CENTRE_INITIALISERS.keys()) def test_center_initialisers(init_key): """Test all center initialisers.""" params = {} @@ -64,8 +64,8 @@ def test_center_initialisers(init_key): _run_initialisation_test( key=init_key, - init_func=CENTER_INITIALISERS[init_key], - init_func_indexes=CENTER_INITIALISER_INDEXES.get(init_key, None), + init_func=_CENTRE_INITIALISERS[init_key], + 
init_func_indexes=_CENTRE_INITIALISER_INDEXES.get(init_key, None), init_func_params=params, ) @@ -80,8 +80,8 @@ def test_distance_center_initialisers(init_key, dist): } _run_initialisation_test( key=init_key, - init_func=CENTER_INITIALISERS[init_key], - init_func_indexes=CENTER_INITIALISER_INDEXES.get(init_key, None), + init_func=_CENTRE_INITIALISERS[init_key], + init_func_indexes=_CENTRE_INITIALISER_INDEXES.get(init_key, None), init_func_params=params, ) @@ -92,7 +92,7 @@ def test_distance_center_initialisers_params(init_key): n_clusters = 3 X = make_example_3d_numpy(50, 1, 10, random_state=1, return_y=False) - init_func_no_window = CENTER_INITIALISERS[init_key]( + init_func_no_window = _CENTRE_INITIALISERS[init_key]( X=X, n_clusters=n_clusters, distance_params={}, @@ -100,7 +100,7 @@ def test_distance_center_initialisers_params(init_key): random_state=RandomState(1), ) - init_func_window = CENTER_INITIALISERS[init_key]( + init_func_window = _CENTRE_INITIALISERS[init_key]( X=X, n_clusters=n_clusters, distance_params={"gamma": 0.00001}, diff --git a/aeon/clustering/tests/test_elastic_som.py b/aeon/clustering/tests/test_elastic_som.py index 0c21ee74ef..530ba7116c 100644 --- a/aeon/clustering/tests/test_elastic_som.py +++ b/aeon/clustering/tests/test_elastic_som.py @@ -4,7 +4,7 @@ import pytest from aeon.clustering import ElasticSOM -from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS +from aeon.clustering._cluster_initialisation import _CENTRE_INITIALISERS from aeon.distances import dtw_distance, msm_alignment_path from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -38,7 +38,7 @@ def test_elastic_som_multivariate(): assert preds.shape == (10,) -@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISERS.keys()) + ["ndarray"]) def test_elastic_som_init(init): """Test ElasticSOM with a custom 
initialization.""" X = make_example_3d_numpy( diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index 841431f213..67bafee9f7 100644 --- a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -7,7 +7,7 @@ from sklearn import metrics from aeon.clustering import TimeSeriesKMeans -from aeon.clustering._cluster_initialisation import CENTER_INITIALISERS +from aeon.clustering._cluster_initialisation import _CENTRE_INITIALISERS from aeon.datasets import load_basic_motions from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -251,7 +251,7 @@ def test_k_mean_ba(distance, averaging_method): @pytest.mark.parametrize("distance", TEST_DISTANCE_WITH_CUSTOM_DISTANCE) -@pytest.mark.parametrize("init", list(CENTER_INITIALISERS.keys()) + ["ndarray"]) +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISERS.keys()) + ["ndarray"]) def test_k_mean_init(distance, init): """Test implementation of Kmeans.""" distance, params = distance @@ -418,7 +418,7 @@ def test_center_initialisers(): random_state = RandomState(1) # Test all available initializers - for init_name, initialiser_func in CENTER_INITIALISERS.items(): + for init_name, initialiser_func in _CENTRE_INITIALISERS.items(): if init_name == "kmeans++" or init_name == "kmedoids++": # kmeans++ and kmedoids++ needs additional parameters centers = initialiser_func( diff --git a/aeon/clustering/tests/test_k_medoids.py b/aeon/clustering/tests/test_k_medoids.py index cb770249bf..6864a09465 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -6,7 +6,7 @@ import pytest from sklearn import metrics -from aeon.clustering._cluster_initialisation import CENTER_INITIALISER_INDEXES +from aeon.clustering._cluster_initialisation import _CENTRE_INITIALISER_INDEXES from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.datasets import 
load_basic_motions, load_gunpoint @@ -161,7 +161,7 @@ def check_value_in_every_cluster(num_clusters, initial_medoids): assert original_length == len(set(initial_medoids)) -@pytest.mark.parametrize("init", list(CENTER_INITIALISER_INDEXES.keys()) + ["indexes"]) +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISER_INDEXES.keys()) + ["indexes"]) def test_medoids_init(init): """Test implementation of Kmedoids.""" X_train, _ = load_gunpoint(split="train") From 28c8cac33d851471620a3d0bcbbb5d5ed6187f94 Mon Sep 17 00:00:00 2001 From: chrisholder Date: Sat, 22 Nov 2025 03:14:17 +0000 Subject: [PATCH 8/8] minor --- aeon/clustering/_cluster_initialisation.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py index 4691621694..d950eed5cc 100644 --- a/aeon/clustering/_cluster_initialisation.py +++ b/aeon/clustering/_cluster_initialisation.py @@ -374,8 +374,7 @@ def resolve_center_initialiser( n_clusters=n_clusters, random_state=random_state, ) - - if isinstance(init, np.ndarray): + elif isinstance(init, np.ndarray): if len(init) != n_clusters: raise ValueError( f"The value provided for init: {init} is invalid. " @@ -415,11 +414,11 @@ def resolve_center_initialiser( f"{X.shape[2]}), got {init.shape}." ) return init.copy() - - raise ValueError( - f"The value provided for init: {init} is invalid. " - f"Expected a string or np.ndarray." - ) + else: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected a string or np.ndarray." + ) _CENTRE_INITIALISERS = {