
Commit 350e8d9 (parent: 74ca846)

refacto: prefer optim group lists instead of a single dict selector -> group

7 files changed: +115, -29 lines

changelog.md

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@
 - LinearSchedule (mostly used for LR scheduling) now allows a `end_value` parameter to configure if the learning rate should decay to zero or another value.
 - New `eds.explode` pipe that splits one document into multiple documents, one per span yielded by its `span_getter` parameter, each new document containing exactly that single span.
 - New `Training a span classifier` tutorial, and reorganized deep-learning docs
+- `ScheduledOptimizer` now warns when a parameter selector does not match any parameter.
 
 ## Fixed
 
@@ -24,6 +25,7 @@
 
 - Sections cues in `eds.history` are now section titles, and not the full section.
 - :boom: Validation metrics are now found under the root field `validation` in the training logs (e.g. `metrics['validation']['ner']['micro']['f']`)
+- It is now recommended to define the optimizer groups of `ScheduledOptimizer` as a list of dicts of optim hyper-parameters, each containing a `selector` regex key, rather than as a single dict with selectors as keys and dicts of optim hyper-parameters as values. This allows for more flexibility in defining the optimizer groups and is more consistent with the rest of the EDS-NLP API. It also makes it easier to reference group values from other places in config files, since their path no longer contains a complex regex string. See the updated training tutorials for more details.
 
 ## v0.17.2 (2025-06-25)
 
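As a minimal sketch of the change described in the last changelog entry above (values are illustrative and adapted loosely from the docstring example in edsnlp/training/optimizer.py further down this page), the two formats compare as follows, and the new list form can be addressed by index rather than by the regex string itself:

# Old format: a single dict mapping a regex selector to its hyper-parameters
# (False meaning "exclude these parameters from optimization").
old_groups = {
    "bias": False,
    "^ner[.]embedding": {"lr": 5e-5},
    "": {"weight_decay": 0.01},
}

# New recommended format: a list of dicts, each carrying its own `selector`
# key and, optionally, `exclude: True`.
new_groups = [
    {"selector": "bias", "exclude": True},
    {"selector": "^ner[.]embedding", "lr": 5e-5},
    {"selector": "", "weight_decay": 0.01},
]

# With a list, a group's values can be referenced by position instead of by the
# regex string, which is what the changelog entry means by simpler config paths.
print(old_groups["^ner[.]embedding"]["lr"])  # path contains the regex
print(new_groups[1]["lr"])                   # path is just an index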

docs/tutorials/training-ner.md

Lines changed: 2 additions & 2 deletions
@@ -120,14 +120,14 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
   groups:
     # Assign parameters starting with transformer (ie the parameters of the transformer component)
     # to a first group
-    "^transformer":
+    - selector: "ner[.]embedding[.]embedding"
       lr:
         '@schedules': linear
         "warmup_rate": 0.1
         "start_value": 0
         "max_value": 5e-5
     # And every other parameters to the second group
-    "":
+    - selector: ".*"
       lr:
         '@schedules': linear
         "warmup_rate": 0.1

docs/tutorials/training-span-classifier.md

Lines changed: 4 additions & 2 deletions
@@ -126,13 +126,15 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
   "@core": optimizer !draft # (2)!
   optim: torch.optim.AdamW
   groups:
-    'biopsy_classifier[.]embedding':
+    # Small learning rate for the pretrained transformer model
+    - selector: 'biopsy_classifier[.]embedding[.]embedding'
       lr:
         '@schedules': linear
         warmup_rate: 0.1
        start_value: 0.
         max_value: 5e-5
-    '.*':
+    # Larger learning rate for the rest of the model
+    - selector: '.*'
      lr:
         '@schedules': linear
         warmup_rate: 0.1

edsnlp/training/optimizer.py

Lines changed: 55 additions & 21 deletions
@@ -1,4 +1,5 @@
 import importlib
+import warnings
 from collections import defaultdict
 from typing import (
     Any,
@@ -166,7 +167,9 @@ def __init__(
         optim: Union[torch.optim.Optimizer, Type[torch.optim.Optimizer], str],
         module: Optional[Union[PipelineProtocol, torch.nn.Module]] = None,
         total_steps: Optional[int] = None,
-        groups: Optional[Dict[str, Union[Dict, Literal[False]]]] = None,
+        groups: Optional[
+            Union[List[Dict], Dict[str, Union[Dict, Literal[False]]]]
+        ] = None,
         init_schedules: bool = True,
         **kwargs,
     ):
@@ -183,13 +186,17 @@ def __init__(
         optim = ScheduledOptimizer(
             cls="adamw",
             module=model,
-            groups={
+            groups=[
                 # Exclude all parameters matching 'bias' from optimization.
-                "bias": False,
-                # Parameters starting with 'transformer' receive this learning rate
+                {
+                    "selector": "bias",
+                    "exclude": True,
+                },
+                # Parameters of the NER module's embedding receive this learning rate
                 # schedule. If a parameter matches both 'transformer' and 'ner',
-                # the 'transformer' settings take precedence due to the order.
-                "^transformer": {
+                # the first group's settings take precedence due to the order.
+                {
+                    "selector": "^ner[.]embedding",
                     "lr": {
                         "@schedules": "linear",
                         "start_value": 0.0,
@@ -199,7 +206,8 @@ def __init__(
                 },
                 # Parameters starting with 'ner' receive this learning rate schedule,
                 # unless a 'lr' value has already been set by an earlier selector.
-                "^ner": {
+                {
+                    "selector": "^ner",
                     "lr": {
                         "@schedules": "linear",
                         "start_value": 0.0,
@@ -209,10 +217,11 @@ def __init__(
                 },
                 # Apply a weight_decay of 0.01 to all parameters not excluded.
                 # This setting doesn't conflict with others and applies to all.
-                "": {
+                {
+                    "selector": "",
                     "weight_decay": 0.01,
                 },
-            },
+            ],
             total_steps=1000,
         )
         ```
@@ -221,24 +230,28 @@ def __init__(
         ----------
         optim : Union[str, Type[torch.optim.Optimizer], torch.optim.Optimizer]
             The optimizer to use. If a string (like "adamw") or a type to instantiate,
-            the`module` and `groups` must be provided.
+            the `module` and `groups` must be provided.
         module : Optional[Union[PipelineProtocol, torch.nn.Module]]
             The module to optimize. Usually the `nlp` pipeline object.
         total_steps : Optional[int]
             The total number of steps, used for schedules.
-        groups : Optional[Dict[str, Group]]
-            The groups to optimize. The key is a regex selector to match parameters in
-            `module.named_parameters()` and the value is a dictionary with the keys
-            `params` and `schedules`.
-
-            The matching is performed by running `regex.search(selector, name)` so you
-            do not have to match the full name. Note that the order of dict keys
-            matter. If a parameter name matches multiple selectors, the
+        groups : Optional[List[Group]]
+            The groups to optimize. Each group is a dictionary containing:
+
+            - a regex `selector` key to match the parameters of that group by their
+              names (as listed by `nlp.named_parameters()`)
+            - several other keys that define the optimizer hyper-parameters for that
+              group, such as `lr`, `weight_decay`, etc. The value for these keys can
+              be a `Schedule` instance or a simple value
+            - an `exclude` key that can be set to True to exclude matching parameters
+
+            The matching is performed by running `regex.search(selector, name)` so you
+            do not have to match the full name. Note that the order of the groups
+            matters. If a parameter name matches multiple selectors, the
             configurations of these selectors are combined in reverse order (from the
             last matched selector to the first), allowing later selectors to complete
-            options from earlier ones. If a selector maps to `False`, any parameters
-            matching it are excluded from optimization and not included in any parameter
-            group.
+            options from earlier ones. If a group contains `exclude=True`, any
+            parameter matching its selector is excluded from optimization.
         """
         should_instantiate_optim = isinstance(optim, str) or isinstance(optim, type)
         if should_instantiate_optim and (groups is None or module is None):
@@ -257,6 +270,15 @@ def __init__(
         if should_instantiate_optim:
            named_parameters = list(module.named_parameters())
             groups = Config.resolve(groups, registry=edsnlp.registry)
+
+            # New groups format: convert the list of group dicts to the legacy mapping
+            if isinstance(groups, list):
+                groups = [dict(g) for g in groups]
+                groups = {
+                    g.pop("selector"): g if not g.get("exclude") else False
+                    for g in groups
+                }
+
             groups = {
                 sel: dict(group) if group else False for sel, group in groups.items()
             }
@@ -268,8 +290,20 @@ def __init__(
                 )
             )
             groups_to_params = defaultdict(lambda: [])
+            empty_selectors = {sel for sel in groups}
             for params, group in param_to_groups.items():
                 groups_to_params[group].append(params)
+                for sel in group:
+                    empty_selectors.discard(sel)
+
+            if empty_selectors:
+                warnings.warn(
+                    f"Selectors {list(empty_selectors)} did not match any parameters."
+                )
+                warnings.warn(
+                    "For reference, here are the parameters of the module:\n"
+                    + "\n".join("- " + name for name, _ in named_parameters)
+                )
 
             cliques = []
             for selectors, params in groups_to_params.items():
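The docstring above describes the matching and merging rules in prose. The following standalone sketch illustrates them; `resolve_param_settings` is a hypothetical helper, not EDS-NLP's implementation (which also uses the `regex` package where plain `re` suffices here): selectors are applied with a regex search, matched groups are merged in reverse order so the first match wins on conflicts, and `exclude=True` drops the parameter entirely.

import re
from typing import Dict, List, Optional


def resolve_param_settings(name: str, groups: List[Dict]) -> Optional[Dict]:
    """Merged hyper-parameters for one parameter name, or None if it is excluded."""
    matched = [g for g in groups if re.search(g["selector"], name)]
    if any(g.get("exclude") for g in matched):
        return None  # excluded from optimization, not placed in any param group
    settings: Dict = {}
    # Reverse order: the first matched (more specific) group is applied last and wins.
    for group in reversed(matched):
        settings.update(
            {k: v for k, v in group.items() if k not in ("selector", "exclude")}
        )
    return settings or None


groups = [
    {"selector": "bias", "exclude": True},
    {"selector": "^ner[.]embedding", "lr": 5e-5},
    {"selector": "", "weight_decay": 0.01},
]

print(resolve_param_settings("ner.embedding.weight", groups))  # {'weight_decay': 0.01, 'lr': 5e-05}
print(resolve_param_settings("ner.linear.bias", groups))       # None (excluded)
print(resolve_param_settings("other.weight", groups))          # {'weight_decay': 0.01}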

tests/training/ner_qlf_diff_bert_config.yml

Lines changed: 4 additions & 2 deletions
@@ -64,8 +64,10 @@ optimizer:
   optim: torch.optim.AdamW
   module: ${ nlp }
   groups:
-    "^transformer": false
-    ".*":
+    # Transformer
+    - selector: "ner[.]embedding[.]embedding"
+      exclude: true
+    - selector: ".*"
       lr:
         "@schedules": linear
         start_value: 1e-3

tests/training/ner_qlf_same_bert_config.yml

Lines changed: 4 additions & 2 deletions
@@ -60,8 +60,10 @@ optimizer:
   optim: AdamW
   module: ${ nlp }
   groups:
-    "^transformer": false
-    ".*":
+    # Transformer
+    - selector: "transformer"
+      exclude: true
+    - selector: ".*"
       lr: 1e-3
 
 # 📚 DATA
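For intuition about what the `exclude: true` groups in the two configs above amount to, here is a rough plain-torch sketch; the module names are made up, and `ScheduledOptimizer` additionally resolves selectors by regex and wires in schedules. Excluded parameters simply never end up in any optimizer parameter group.

import re

import torch

# Toy module standing in for a pipeline: "transformer" mimics the pretrained
# encoder, "ner" mimics the task head. Names are illustrative only.
model = torch.nn.ModuleDict(
    {
        "transformer": torch.nn.Linear(8, 8),
        "ner": torch.nn.Linear(8, 4),
    }
)

# Roughly `- selector: "transformer"` + `exclude: true`, followed by
# `- selector: ".*"` with lr 1e-3: transformer parameters are simply left out.
included = [
    p for name, p in model.named_parameters() if not re.search("transformer", name)
]
optimizer = torch.optim.AdamW([{"params": included, "lr": 1e-3}])

print(len(optimizer.param_groups))       # 1
print(sum(p.numel() for p in included))  # only the "ner" parameters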

tests/training/test_optimizer.py

Lines changed: 44 additions & 0 deletions
@@ -68,6 +68,28 @@ def net():
                 "weight_decay": 0.0,
             },
         },
+        # New groups format
+        [
+            {
+                "selector": "fc1[.].*",
+                "lr": {
+                    "@schedules": "linear",
+                    "start_value": 0.0,
+                    "max_value": 0.1,
+                    "warmup_rate": 0.2,
+                },
+                "weight_decay": 0.01,
+            },
+            {
+                "selector": "fc2[.]bias",
+                "exclude": True,
+            },
+            {
+                "selector": "",
+                "lr": 0.0001,
+                "weight_decay": 0.0,
+            },
+        ],
     ],
 )
 def test_old_parameter_selection(net, groups):
@@ -172,3 +194,25 @@ def test_repr(net):
     optim.initialize()
 
     assert "ScheduledOptimizer[AdamW]" in repr(optim)
+
+
+def test_warn_empty_selector(net):
+    with pytest.warns(
+        UserWarning,
+        match="Selectors ['fc3[.].*'] did not match any parameters.",
+    ):
+        ScheduledOptimizer(
+            optim="adamw",
+            module=net,
+            groups=[
+                {
+                    "selector": "fc3[.].*",
+                    "lr": 0.1,
+                    "weight_decay": 0.01,
+                    "schedules": LinearSchedule(start_value=0.0, warmup_rate=0.2),
+                },
+                {"selector": "fc2[.]bias", "exclude": True},
+                {"selector": "", "lr": 0.0001, "weight_decay": 0.0},
+            ],
+            total_steps=10,
+        )
