Skip to content

Commit 470bc7d

Browse files
authored
SDK v2 and CLI changes for distributed training for automl tabular task types. (Azure#29341)
* Initial draft. * Initial draft. * Fix imports. * Add the two flags and enum. * Add tests for limit settings- max_nodes. * Add unittests for forecasting job. * Add unittests for classification jobs. * Add unittests for regression job. * Add tests for CLI schema. * Fix missing new lines in yaml files. * Add documentation. * Fix formatting issues. * Set max_nodes to 1 by default. * Fix constant import formatting. * Fix code review comments. * Fix formatting issue. * Redefine constants with proper documentation. * Fix imports. * Fix documentation.
1 parent 34273f7 commit 470bc7d

26 files changed

+1594
-31
lines changed

sdk/ml/azure-ai-ml/azure/ai/ml/_schema/automl/table_vertical/table_vertical_limit_settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from marshmallow import fields, post_load
88

9+
from azure.ai.ml._schema import ExperimentalField
910
from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
1011
from azure.ai.ml.constants._job.automl import AutoMLConstants
1112

@@ -15,6 +16,7 @@ class AutoMLTableLimitsSchema(metaclass=PatchedSchemaMeta):
1516
exit_score = fields.Float()
1617
max_concurrent_trials = fields.Int()
1718
max_cores_per_trial = fields.Int()
19+
max_nodes = ExperimentalField(fields.Int())
1820
max_trials = fields.Int(data_key=AutoMLConstants.MAX_TRIALS_YAML)
1921
timeout_minutes = fields.Int() # type duration
2022
trial_timeout_minutes = fields.Int() # type duration

sdk/ml/azure-ai-ml/azure/ai/ml/_schema/automl/training_settings.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
RegressionModels,
1313
StackMetaLearnerType,
1414
)
15+
from azure.ai.ml.constants import TabularTrainingMode
16+
from azure.ai.ml._schema import ExperimentalField
1517
from azure.ai.ml._schema.core.fields import NestedField, StringTransformedEnum
1618
from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
1719
from azure.ai.ml._utils.utils import camel_to_snake
@@ -49,6 +51,12 @@ class TrainingSettingsSchema(metaclass=PatchedSchemaMeta):
4951
enable_vote_ensemble = fields.Bool()
5052
ensemble_model_download_timeout = fields.Int(data_key=AutoMLConstants.ENSEMBLE_MODEL_DOWNLOAD_TIMEOUT_YAML)
5153
stack_ensemble_settings = NestedField(StackEnsembleSettingsSchema())
54+
training_mode = ExperimentalField(
55+
StringTransformedEnum(
56+
allowed_values=[o.value for o in TabularTrainingMode],
57+
casing_transform=camel_to_snake,
58+
)
59+
)
5260

5361

5462
class ClassificationTrainingSettingsSchema(TrainingSettingsSchema):

sdk/ml/azure-ai-ml/azure/ai/ml/constants/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@
1717
JobType,
1818
NlpLearningRateScheduler,
1919
NlpModels,
20+
TabularTrainingMode,
2021
)
2122
from ._registry import AcrAccountSku, StorageAccountType
2223
from ._workspace import ManagedServiceIdentityType
2324

25+
TabularTrainingMode.__module__ = __name__
26+
2427
__all__ = [
2528
"ImportSourceType",
2629
"JobType",
@@ -40,4 +43,5 @@
4043
"NlpModels",
4144
"NlpLearningRateScheduler",
4245
"Scope",
46+
"TabularTrainingMode",
4347
]

sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
ImageObjectDetectionModelNames,
1313
NlpLearningRateScheduler,
1414
NlpModels,
15+
TabularTrainingMode,
1516
)
1617
from .job import DistributionType, ImportSourceType, JobType
1718
from .pipeline import PipelineConstants
@@ -30,4 +31,5 @@
3031
"SearchSpace",
3132
"NlpModels",
3233
"NlpLearningRateScheduler",
34+
"TabularTrainingMode",
3335
]

sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/automl.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
# pylint: disable=unused-import
77
from azure.ai.ml._restclient.v2022_10_01_preview.models import NlpLearningRateScheduler
8+
from azure.ai.ml._restclient.v2023_02_01_preview.models import TrainingMode
9+
from azure.ai.ml._utils._experimental import experimental
810

911

1012
class AutoMLConstants:
@@ -107,3 +109,8 @@ class NlpModels(Enum):
107109
XLM_ROBERTA_LARGE = "xlm-roberta-large"
108110
XLNET_BASE_CASED = "xlnet-base-cased"
109111
XLNET_LARGE_CASED = "xlnet-large-cased"
112+
113+
114+
TrainingMode.__doc__ = "Mode to enable/disable distributed training."
115+
TabularTrainingMode = experimental(TrainingMode)
116+
TabularTrainingMode.__name__ = "TabularTrainingMode"

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/automl/tabular/automl_tabular.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
LogVerbosity,
1515
)
1616
from azure.ai.ml._utils.utils import camel_to_snake
17+
from azure.ai.ml.constants import TabularTrainingMode
1718
from azure.ai.ml.constants._job.automl import AutoMLConstants
1819
from azure.ai.ml.entities._inputs_outputs import Input
1920
from azure.ai.ml.entities._job.automl.automl_vertical import AutoMLVertical
@@ -245,6 +246,7 @@ def set_limits(
245246
exit_score: Optional[float] = None,
246247
max_concurrent_trials: Optional[int] = None,
247248
max_cores_per_trial: Optional[int] = None,
249+
max_nodes: Optional[int] = None,
248250
max_trials: Optional[int] = None,
249251
timeout_minutes: Optional[int] = None,
250252
trial_timeout_minutes: Optional[int] = None,
@@ -301,6 +303,14 @@ def set_limits(
301303
302304
* Equal to 1, the default.
303305
:paramtype max_cores_per_trial: typing.Optional[int]
306+
:keyword max_nodes: [Experimental] The maximum number of nodes to use for distributed training.
307+
308+
* For forecasting, each model is trained using max(2, int(max_nodes / max_concurrent_trials)) nodes.
309+
310+
* For classification/regression, each model is trained using max_nodes nodes.
311+
312+
Note: This parameter is in public preview and might change in the future.
313+
:paramtype max_nodes: typing.Optional[int]
304314
:keyword max_trials: The total number of different algorithm and parameter combinations to test during an
305315
automated ML experiment. If not specified, the default is 1000 iterations.
306316
:paramtype max_trials: typing.Optional[int]
@@ -324,6 +334,7 @@ def set_limits(
324334
self._limits.max_cores_per_trial = (
325335
max_cores_per_trial if max_cores_per_trial is not None else self._limits.max_cores_per_trial
326336
)
337+
self._limits.max_nodes = max_nodes if max_nodes is not None else self._limits.max_nodes
327338
self._limits.max_trials = max_trials if max_trials is not None else self._limits.max_trials
328339
self._limits.timeout_minutes = timeout_minutes if timeout_minutes is not None else self._limits.timeout_minutes
329340
self._limits.trial_timeout_minutes = (
@@ -342,6 +353,7 @@ def set_training(
342353
ensemble_model_download_timeout: Optional[int] = None,
343354
allowed_training_algorithms: Optional[List[str]] = None,
344355
blocked_training_algorithms: Optional[List[str]] = None,
356+
training_mode: Optional[Union[str, TabularTrainingMode]] = None,
345357
) -> None:
346358
"""The method to configure training related settings.
347359
@@ -383,6 +395,17 @@ def set_training(
383395
:paramtype allowed_training_algorithms: typing.Optional[List[str]]
384396
:keyword blocked_training_algorithms: A list of algorithms to ignore for an experiment, defaults to None
385397
:paramtype blocked_training_algorithms: typing.Optional[List[str]]
398+
:keyword training_mode: [Experimental] The training mode to use.
399+
The possible values are-
400+
401+
* distributed- enables distributed training for supported algorithms.
402+
403+
* non_distributed- disables distributed training.
404+
405+
* auto- Currently, it is the same as non_distributed. This might change in the future.
406+
407+
Note: This parameter is in public preview and may change in the future.
408+
:paramtype training_mode: typing.Optional[typing.Union[str, azure.ai.ml.constants.TabularTrainingMode]]
386409
"""
387410
# get training object by calling training getter of respective tabular task
388411
self._training = self.training
@@ -417,6 +440,7 @@ def set_training(
417440

418441
self._training.allowed_training_algorithms = allowed_training_algorithms
419442
self._training.blocked_training_algorithms = blocked_training_algorithms
443+
self._training.training_mode = training_mode if training_mode is not None else self._training.training_mode
420444

421445
def set_featurization(
422446
self,

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/automl/tabular/forecasting_job.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from azure.ai.ml._restclient.v2023_02_01_preview.models import Forecasting as RestForecasting
1111
from azure.ai.ml._restclient.v2023_02_01_preview.models import ForecastingPrimaryMetrics, JobBase, TaskType
1212
from azure.ai.ml._utils.utils import camel_to_snake, is_data_binding_expression
13+
from azure.ai.ml.constants import TabularTrainingMode
1314
from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY
1415
from azure.ai.ml.constants._job.automl import AutoMLConstants
1516
from azure.ai.ml.entities._credentials import _BaseJobIdentityConfiguration
@@ -351,6 +352,7 @@ def set_training(
351352
ensemble_model_download_timeout: Optional[int] = None,
352353
allowed_training_algorithms: Optional[List[str]] = None,
353354
blocked_training_algorithms: Optional[List[str]] = None,
355+
training_mode: Optional[Union[str, TabularTrainingMode]] = None,
354356
) -> None:
355357
super().set_training(
356358
enable_onnx_compatible_models=enable_onnx_compatible_models,
@@ -362,6 +364,7 @@ def set_training(
362364
ensemble_model_download_timeout=ensemble_model_download_timeout,
363365
allowed_training_algorithms=allowed_training_algorithms,
364366
blocked_training_algorithms=blocked_training_algorithms,
367+
training_mode=training_mode,
365368
)
366369

367370
# Disable stack ensemble by default, since it is currently not supported for forecasting tasks

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/automl/tabular/limit_settings.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@ class TabularLimitSettings(RestTranslatableMixin):
2121
:type max_concurrent_trials: int, optional
2222
:param max_cores_per_trial: The maximum number of threads to use for a given training iteration.
2323
:type max_cores_per_trial: int, optional
24+
:param max_nodes: [Experimental] The maximum number of nodes to use for distributed training.
25+
26+
* For forecasting, each model is trained using max(2, int(max_nodes / max_concurrent_trials)) nodes.
27+
28+
* For classification/regression, each model is trained using max_nodes nodes.
29+
30+
Note: This parameter is in public preview and might change in the future.
31+
:type max_nodes: int, optional
2432
:param max_trials: Maximum number of AutoML iterations.
2533
:type max_trials: int, optional
2634
:param timeout_minutes: AutoML job timeout.
@@ -36,6 +44,7 @@ def __init__(
3644
exit_score: Optional[float] = None,
3745
max_concurrent_trials: Optional[int] = None,
3846
max_cores_per_trial: Optional[int] = None,
47+
max_nodes: Optional[int] = None,
3948
max_trials: Optional[int] = None,
4049
timeout_minutes: Optional[int] = None,
4150
trial_timeout_minutes: Optional[int] = None,
@@ -44,6 +53,7 @@ def __init__(
4453
self.exit_score = exit_score
4554
self.max_concurrent_trials = max_concurrent_trials
4655
self.max_cores_per_trial = max_cores_per_trial
56+
self.max_nodes = max_nodes
4757
self.max_trials = max_trials
4858
self.timeout_minutes = timeout_minutes
4959
self.trial_timeout_minutes = trial_timeout_minutes
@@ -54,6 +64,7 @@ def _to_rest_object(self) -> RestTabularLimitSettings:
5464
exit_score=self.exit_score,
5565
max_concurrent_trials=self.max_concurrent_trials,
5666
max_cores_per_trial=self.max_cores_per_trial,
67+
max_nodes=self.max_nodes,
5768
max_trials=self.max_trials,
5869
timeout=to_iso_duration_format_mins(self.timeout_minutes),
5970
trial_timeout=to_iso_duration_format_mins(self.trial_timeout_minutes),
@@ -66,6 +77,7 @@ def _from_rest_object(cls, obj: RestTabularLimitSettings) -> "TabularLimitSettin
6677
exit_score=obj.exit_score,
6778
max_concurrent_trials=obj.max_concurrent_trials,
6879
max_cores_per_trial=obj.max_cores_per_trial,
80+
max_nodes=obj.max_nodes,
6981
max_trials=obj.max_trials,
7082
timeout_minutes=from_iso_duration_format_mins(obj.timeout),
7183
trial_timeout_minutes=from_iso_duration_format_mins(obj.trial_timeout),
@@ -79,6 +91,7 @@ def __eq__(self, other: object) -> bool:
7991
and self.exit_score == other.exit_score
8092
and self.max_concurrent_trials == other.max_concurrent_trials
8193
and self.max_cores_per_trial == other.max_cores_per_trial
94+
and self.max_nodes == other.max_nodes
8295
and self.max_trials == other.max_trials
8396
and self.timeout_minutes == other.timeout_minutes
8497
and self.trial_timeout_minutes == other.trial_timeout_minutes

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_job/automl/training_settings.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@
1919
RegressionTrainingSettings as RestRegressionTrainingSettings,
2020
)
2121
from azure.ai.ml._restclient.v2023_02_01_preview.models import TrainingSettings as RestTrainingSettings
22+
from azure.ai.ml._utils._experimental import experimental
2223
from azure.ai.ml._utils.utils import camel_to_snake, from_iso_duration_format_mins, to_iso_duration_format_mins
24+
from azure.ai.ml.constants import TabularTrainingMode
2325
from azure.ai.ml.entities._job.automl.stack_ensemble_settings import StackEnsembleSettings
2426
from azure.ai.ml.entities._mixins import RestTranslatableMixin
27+
from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException
2528

2629

2730
class TrainingSettings(RestTranslatableMixin):
@@ -39,6 +42,7 @@ def __init__(
3942
ensemble_model_download_timeout: Optional[int] = None,
4043
allowed_training_algorithms: Optional[List[str]] = None,
4144
blocked_training_algorithms: Optional[List[str]] = None,
45+
training_mode: Optional[Union[str, TabularTrainingMode]] = None,
4246
):
4347
"""TrainingSettings class for Azure Machine Learning.
4448
@@ -51,6 +55,16 @@ def __init__(
5155
:param ensemble_model_download_timeout: Timeout for downloading ensemble models
5256
:param allowed_training_algorithms: Models to train
5357
:param blocked_training_algorithms: Models that will not be considered for training
58+
:param training_mode: [Experimental] The training mode to use.
59+
The possible values are-
60+
61+
* distributed- enables distributed training for supported algorithms.
62+
63+
* non_distributed- disables distributed training.
64+
65+
* auto- Currently, it is the same as non_distributed. This might change in the future.
66+
67+
Note: This parameter is in public preview and may change in the future.
5468
"""
5569
self.enable_onnx_compatible_models = enable_onnx_compatible_models
5670
self.enable_dnn_training = enable_dnn_training
@@ -61,6 +75,31 @@ def __init__(
6175
self.ensemble_model_download_timeout = ensemble_model_download_timeout
6276
self.allowed_training_algorithms = allowed_training_algorithms
6377
self.blocked_training_algorithms = blocked_training_algorithms
78+
self.training_mode = training_mode
79+
80+
@experimental
@property
def training_mode(self):
    """Return the currently configured training mode.

    :return: Whatever the setter stored: ``None`` when no mode is set,
        otherwise a ``TabularTrainingMode`` member.
    """
    return self._training_mode

@training_mode.setter
def training_mode(self, value: Optional[Union[str, TabularTrainingMode]]):
    """Validate and store the training mode.

    :param value: ``None``, a ``TabularTrainingMode`` member, or a string
        naming one. Strings are normalised with ``camel_to_snake`` and
        upper-cased before being matched against the enum member names.
    :raises ValidationException: If ``value`` is neither ``None``, an enum
        member, nor a string that resolves to an enum member name.
    """
    # Enum members (and None) are stored as-is. The check must be
    # ``isinstance``: comparing with ``is TabularTrainingMode`` tests identity
    # against the enum class itself and never matches a member.
    if value is None or isinstance(value, TabularTrainingMode):
        self._training_mode = value
        return

    # Only strings can be resolved to a member name; anything else falls
    # through to the validation error below instead of failing inside
    # ``camel_to_snake``.
    member_name = camel_to_snake(value).upper() if isinstance(value, str) else None
    if member_name is not None and member_name in TabularTrainingMode.__members__:
        self._training_mode = TabularTrainingMode[member_name]
        return

    supported_values = ", ".join([f'"{camel_to_snake(mode.value)}"' for mode in TabularTrainingMode])
    msg = (
        f"Unsupported training mode: {value}. Supported values are- {supported_values}. "
        "Or you can use azure.ai.ml.constants.TabularTrainingMode enum."
    )
    raise ValidationException(
        message=msg,
        no_personal_data_message=msg,
        target=ErrorTarget.AUTOML,
        error_category=ErrorCategory.USER_ERROR,
    )
64103

65104
@property
66105
def allowed_training_algorithms(self):
@@ -81,6 +120,7 @@ def _to_rest_object(self) -> RestTrainingSettings:
81120
if self.stack_ensemble_settings
82121
else None,
83122
ensemble_model_download_timeout=to_iso_duration_format_mins(self.ensemble_model_download_timeout),
123+
training_mode=self.training_mode,
84124
)
85125

86126
@classmethod
@@ -97,6 +137,7 @@ def _from_rest_object(cls, obj: RestTrainingSettings) -> "TrainingSettings":
97137
if obj.stack_ensemble_settings
98138
else None
99139
),
140+
training_mode=obj.training_mode,
100141
)
101142

102143
def __eq__(self, other: object) -> bool:
@@ -112,6 +153,7 @@ def __eq__(self, other: object) -> bool:
112153
and self.stack_ensemble_settings == other.stack_ensemble_settings
113154
and self.allowed_training_algorithms == other.allowed_training_algorithms
114155
and self.blocked_training_algorithms == other.blocked_training_algorithms
156+
and self.training_mode == other.training_mode
115157
)
116158

117159
def __ne__(self, other: object) -> bool:
@@ -162,6 +204,7 @@ def _to_rest_object(self) -> RestClassificationTrainingSettings:
162204
ensemble_model_download_timeout=to_iso_duration_format_mins(self.ensemble_model_download_timeout),
163205
allowed_training_algorithms=self.allowed_training_algorithms,
164206
blocked_training_algorithms=self.blocked_training_algorithms,
207+
training_mode=self.training_mode,
165208
)
166209

167210
@classmethod
@@ -176,6 +219,7 @@ def _from_rest_object(cls, obj: RestClassificationTrainingSettings) -> "Classifi
176219
stack_ensemble_settings=obj.stack_ensemble_settings,
177220
allowed_training_algorithms=obj.allowed_training_algorithms,
178221
blocked_training_algorithms=obj.blocked_training_algorithms,
222+
training_mode=obj.training_mode,
179223
)
180224

181225

@@ -211,6 +255,7 @@ def _to_rest_object(self) -> RestForecastingTrainingSettings:
211255
ensemble_model_download_timeout=to_iso_duration_format_mins(self.ensemble_model_download_timeout),
212256
allowed_training_algorithms=self.allowed_training_algorithms,
213257
blocked_training_algorithms=self.blocked_training_algorithms,
258+
training_mode=self.training_mode,
214259
)
215260

216261
@classmethod
@@ -225,6 +270,7 @@ def _from_rest_object(cls, obj: RestForecastingTrainingSettings) -> "Forecasting
225270
stack_ensemble_settings=obj.stack_ensemble_settings,
226271
allowed_training_algorithms=obj.allowed_training_algorithms,
227272
blocked_training_algorithms=obj.blocked_training_algorithms,
273+
training_mode=obj.training_mode,
228274
)
229275

230276

@@ -260,6 +306,7 @@ def _to_rest_object(self) -> RestRegressionTrainingSettings:
260306
ensemble_model_download_timeout=to_iso_duration_format_mins(self.ensemble_model_download_timeout),
261307
allowed_training_algorithms=self.allowed_training_algorithms,
262308
blocked_training_algorithms=self.blocked_training_algorithms,
309+
training_mode=self.training_mode,
263310
)
264311

265312
@classmethod
@@ -274,4 +321,5 @@ def _from_rest_object(cls, obj: RestRegressionTrainingSettings) -> "RegressionTr
274321
stack_ensemble_settings=obj.stack_ensemble_settings,
275322
allowed_training_algorithms=obj.allowed_training_algorithms,
276323
blocked_training_algorithms=obj.blocked_training_algorithms,
324+
training_mode=obj.training_mode,
277325
)

0 commit comments

Comments
 (0)