[ENH] Add data loader for EEG classification datasets (#107)

TonyBagnall · web-flow · commit 52c98a25cbb4 · 2025-08-27T14:19:48.000+01:00
* dataset lists

* data loader

* data loader

* data loader

* data loader
diff --git a/aeon_neuro/datasets/__init__.py b/aeon_neuro/datasets/__init__.py
@@ -1,8 +1,9 @@
 """Utilities for loading datasets."""
 
 __maintainer__ = ["TonyBagnall"]
-__all__ = ["load_kdd_example", "load_kdd_full_example"]
+__all__ = ["load_kdd_example", "load_kdd_full_example", "load_eeg_classification"]
 
+from aeon_neuro.datasets._data_loaders import load_eeg_classification
 from aeon_neuro.datasets._single_problem_loaders import (
     load_kdd_example,
     load_kdd_full_example,
diff --git a/aeon_neuro/datasets/_data_loaders.py b/aeon_neuro/datasets/_data_loaders.py
@@ -0,0 +1,145 @@
+"""Function to load EEG Datasets from Zenodo."""
+
+import os
+from urllib.request import urlretrieve
+
+from aeon.datasets._single_problem_loaders import _load_saved_dataset
+from aeon.datasets.dataset_collections import get_downloaded_tsc_tsr_datasets
+
+import aeon_neuro
+from aeon_neuro.datasets.classification_datasets import dataset_map
+
+DIRNAME = "data"  # name of the folder under MODULE that the loader searches first
+MODULE = os.path.join(os.path.dirname(aeon_neuro.__file__), "datasets")
+
+
+def load_eeg_classification(
+    name,
+    split=None,
+    extract_path=None,
+    return_metadata=False,
+):
+    """Load an EEG classification dataset.
+
+    This function loads an EEG time series classification problem into memory. It
+    looks for the data on disk first and, if it is not found, tries to download
+    it from https://zenodo.org/. Only datasets listed in ``dataset_map`` in
+    ``aeon_neuro/datasets/classification_datasets.py`` with a Zenodo record number
+    can be downloaded. The data is expected to be stored as
+    ``<extract_path>/<name>/<name>_TRAIN.ts`` and
+    ``<extract_path>/<name>/<name>_TEST.ts``. If you want to load a file directly
+    from a full path that is in ``aeon`` ts format, use the function
+    ``load_from_ts_file`` in ``aeon`` directly.
+
+    If ``extract_path`` is not given, the function looks in
+    ``aeon_neuro/datasets/data`` and then in ``aeon_neuro/datasets/local_data``,
+    which is also where downloaded files are saved.
+
+    Data is assumed to be in the standard ``aeon`` .ts format: each row is a (possibly
+    multivariate) time series. Each channel is separated by a colon, each value in
+    a series is comma separated. For examples see aeon_neuro.datasets.data.
+
+    Parameters
+    ----------
+    name : str
+        Name of the dataset. The function looks for it on disk first and, if it
+        is not present, attempts to download it from Zenodo, saving it to
+        ``extract_path`` (or ``aeon_neuro/datasets/local_data`` if no path is given).
+    split : None or str{"train", "test"}, default=None
+        Whether to load the train or test partition of the problem. By default it
+        loads both into a single dataset, otherwise it looks only for files of the
+        format <name>_TRAIN.ts or <name>_TEST.ts.
+    extract_path : str, default=None
+        The path to look for the data. If no path is provided, the function
+        looks in the ``aeon_neuro/datasets`` package directory as described
+        above. If a path is given, it can be absolute, e.g. C:/Temp/, or relative,
+        e.g. Temp/ or ./Temp/. The directory is created if it does not exist.
+    return_metadata : boolean, default=False
+        If True, returns a tuple (X, y, metadata).
+
+    Returns
+    -------
+    X: np.ndarray or list of np.ndarray
+        The time series, as read by ``aeon`` from the .ts files.
+    y: np.ndarray
+        The class labels for each case in X
+    metadata: dict, optional
+        returns the following metadata
+        'problemname',timestamps, missing,univariate,equallength, class_values
+        targetlabel should be false, and classlabel true
+
+    Raises
+    ------
+    ValueError
+        If the dataset is not found on disk and cannot be downloaded, either
+        because it has no Zenodo record in ``dataset_map`` or because the download
+        itself fails, or if the loaded data is not a classification problem.
+
+    Examples
+    --------
+    >>> from aeon_neuro.datasets import load_eeg_classification
+    >>> X, y = load_eeg_classification("SelfRegulationSCP1")  # doctest: +SKIP
+    >>> X, y, meta = load_eeg_classification(
+    ...     "SelfRegulationSCP1", return_metadata=True
+    ... )  # doctest: +SKIP
+    """
+    # A user-supplied path is searched as-is; otherwise look first in the "data"
+    # directory inside the installed package.
+    if extract_path is not None:
+        local_module = extract_path
+        local_dirname = None
+        path = extract_path
+    else:
+        local_module = MODULE
+        local_dirname = "data"
+        path = os.path.join(local_module, local_dirname)
+    os.makedirs(path, exist_ok=True)
+    if name not in get_downloaded_tsc_tsr_datasets(path):
+        if extract_path is None:
+            # Not found in "data": switch to the "local_data" download directory.
+            local_dirname = "local_data"
+            path = os.path.join(local_module, local_dirname)
+            os.makedirs(path, exist_ok=True)
+        error_str = (
+            f"File name {name} is not in the list of valid files to download, "
+            f"see aeon_neuro.datasets.classification_datasets for the current "
+            f"list of maintained datasets."
+        )
+        if name not in get_downloaded_tsc_tsr_datasets(path):
+            # 49 is the placeholder id of datasets without a Zenodo record yet;
+            # names missing from ``dataset_map`` are rejected the same way.
+            zenodo_id = dataset_map.get(name, 49)
+            if zenodo_id == 49:
+                raise ValueError(error_str)
+            base_url = f"https://zenodo.org/record/{zenodo_id}/files"
+            full_path = os.path.join(path, name)
+            os.makedirs(full_path, exist_ok=True)
+            try:
+                for part in ("TRAIN", "TEST"):
+                    file_name = f"{name}_{part}.ts"
+                    target = os.path.join(full_path, file_name)
+                    urlretrieve(f"{base_url}/{file_name}", target)
+            except Exception as err:
+                # Keep raising ValueError for callers, but report the real cause.
+                raise ValueError(
+                    f"Unable to download {name} from {base_url}: {err}"
+                ) from err
+    X, y, meta = _load_saved_dataset(
+        name=name,
+        dir_name=name,
+        split=split,
+        local_module=local_module,
+        local_dirname=local_dirname,
+        return_meta=True,
+    )
+    # Reject anything that is not flagged as a classification problem.
+    if not meta.get("classlabel"):
+        raise ValueError(
+            f"You have tried to load a regression problem called {name} with "
+            f"load_eeg_classification. This will cause unintended consequences for "
+            f"any classifier you build. If you want to load a regression problem, "
+            f"use load_regression in ``aeon``."
+        )
+    if return_metadata:
+        return X, y, meta
+    return X, y
diff --git a/aeon_neuro/datasets/classification_datasets.py b/aeon_neuro/datasets/classification_datasets.py
@@ -0,0 +1,71 @@
+"""Datasets in the EEG classification archive."""
+
+# Names of the 31 problems in the EEG classification archive.
+
+dataset_names = [
+    "Alzhiemers",
+    "Blink",
+    "ButtonPress",
+    "Epilepsy2",
+    "EyesOpenShut",
+    "FaceDetection",
+    "FingerMovements",
+    "HandMovementDirection",
+    "ImaginedOpenCloseFist",
+    "ImaginedOpenCloseFistFeet",
+    "InnerSpeech",
+    "Liverpool-Fibromyalgia",
+    "LongIntervalTask",
+    "LowCostEEG",
+    "MatchingPennies",
+    "MindReading",
+    "MotorImagery",
+    "N_Back",
+    "OpenCloseFist",
+    "OpenCloseFistFeet",
+    "Photo-Stimulation",
+    "PronouncedSpeech",
+    "PsychologyButtonPress",
+    "SelfRegulationSCP1",
+    "SelfRegulationSCP2",
+    "ShortIntervalTask",
+    "SitStand",
+    "Sleep",
+    "SongFamiliarity",
+    "VIPA",
+    "VisualSpeech",
+]
+# Name -> Zenodo record id, filled in once available; 49 marks not available yet.
+dataset_map = {
+    "Alzhiemers": 49,
+    "Blink": 49,
+    "ButtonPress": 49,
+    "Epilepsy2": 49,
+    "EyesOpenShut": 49,
+    "FaceDetection": 49,
+    "FingerMovements": 49,
+    "HandMovementDirection": 49,
+    "ImaginedOpenCloseFist": 49,
+    "ImaginedOpenCloseFistFeet": 49,
+    "InnerSpeech": 49,
+    "Liverpool-Fibromyalgia": 49,
+    "LongIntervalTask": 49,
+    "LowCostEEG": 49,
+    "MatchingPennies": 49,
+    "MindReading": 49,
+    "MotorImagery": 49,
+    "N_Back": 49,
+    "OpenCloseFist": 49,
+    "OpenCloseFistFeet": 49,
+    "Photo-Stimulation": 49,
+    "PronouncedSpeech": 49,
+    "PsychologyButtonPress": 49,
+    "SelfRegulationSCP1": 49,
+    "SelfRegulationSCP2": 49,
+    "ShortIntervalTask": 49,
+    "SitStand": 49,
+    "Sleep": 49,
+    "SongFamiliarity": 49,
+    "VIPA": 49,
+    "VisualSpeech": 49,
+}
diff --git a/aeon_neuro/datasets/tests/__init__.py b/aeon_neuro/datasets/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for loaders."""
diff --git a/aeon_neuro/datasets/tests/test_data_loaders.py b/aeon_neuro/datasets/tests/test_data_loaders.py
@@ -0,0 +1,11 @@
+"""Test data loading with shipped data."""
+
+from aeon_neuro.datasets._data_loaders import load_eeg_classification
+
+
+def test_load_eeg():
+    """Check the provided SelfRegulationSCP1 data loads without and with metadata."""
+    series, _ = load_eeg_classification("SelfRegulationSCP1")
+    assert series.shape == (561, 6, 896)
+    _, _, info = load_eeg_classification("SelfRegulationSCP1", return_metadata=True)
+    assert info["problemname"] == "selfregulationscp1"