Skip to content

Commit f9611a1

Browse files
committed
Updated with Experiments management
1 parent 0dc1a00 commit f9611a1

File tree

2 files changed

+196
-12
lines changed

2 files changed

+196
-12
lines changed

2_Using_Pipemode_input_for_big_datasets.ipynb

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
"metadata": {},
7676
"source": [
7777
"Run the previous job, this time use the new script (cifar10_keras_pipe.py)\n",
78-
"Run the job for 10 epochs and configure it with `input_mode='Pipe'`"
78+
"Run the job for 20 epochs and configure it with `input_mode='Pipe'`"
7979
]
8080
},
8181
{
@@ -93,6 +93,40 @@
9393
"role = get_execution_role()"
9494
]
9595
},
96+
{
97+
"cell_type": "markdown",
98+
"metadata": {},
99+
"source": [
100+
"### Load the SageMaker experiment"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": null,
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
109+
"from smexperiments.experiment import Experiment\n",
110+
"from smexperiments.trial import Trial\n",
111+
"import time\n",
112+
"cifar10_experiment = Experiment.load(\n",
113+
" experiment_name=\"TensorFlow-cifar10-experiment\")"
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": null,
119+
"metadata": {},
120+
"outputs": [],
121+
"source": [
122+
"# create a new trial\n",
123+
"trial_name = f\"cifar10-training-job-pipemode-{int(time.time())}\"\n",
124+
"trial = Trial.create(\n",
125+
" trial_name=trial_name, \n",
126+
" experiment_name=cifar10_experiment.experiment_name\n",
127+
")"
128+
]
129+
},
96130
{
97131
"cell_type": "code",
98132
"execution_count": null,
@@ -105,13 +139,70 @@
105139
"estimator = ... "
106140
]
107141
},
142+
{
143+
"cell_type": "markdown",
144+
"metadata": {},
145+
"source": [
146+
"Connect the trial configured above to the job. Add the following experiment config to the `fit` function:\n",
147+
"```python\n",
148+
"experiment_config={\n",
149+
" \"ExperimentName\": cifar10_experiment.experiment_name, \n",
150+
" \"TrialName\": trial.trial_name,\n",
151+
" \"TrialComponentDisplayName\": \"Training\"}\n",
152+
"```"
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": null,
158+
"metadata": {},
159+
"outputs": [],
160+
"source": [
161+
"estimator.fit({'train' : 'train_data_location',\n",
162+
" 'validation' : 'validation_data_location',\n",
163+
" 'eval' : 'eval_data_location'},\n",
164+
" experiment_config=)"
165+
]
166+
},
167+
{
168+
"cell_type": "markdown",
169+
"metadata": {},
170+
"source": [
171+
"### Analyze the experiments"
172+
]
173+
},
108174
{
109175
"cell_type": "code",
110176
"execution_count": null,
111177
"metadata": {},
112178
"outputs": [],
113179
"source": [
114-
"estimator.fit()"
180+
"search_expression = {\n",
181+
" \"Filters\":[\n",
182+
" {\n",
183+
" \"Name\": \"DisplayName\",\n",
184+
" \"Operator\": \"Equals\",\n",
185+
" \"Value\": \"Training\",\n",
186+
" }\n",
187+
" ],\n",
188+
"}"
189+
]
190+
},
191+
{
192+
"cell_type": "code",
193+
"execution_count": null,
194+
"metadata": {},
195+
"outputs": [],
196+
"source": [
197+
"from sagemaker.analytics import ExperimentAnalytics\n",
198+
"trial_component_analytics = ExperimentAnalytics(\n",
199+
" sagemaker_session=sagemaker_session, \n",
200+
" experiment_name=cifar10_experiment.experiment_name,\n",
201+
" search_expression=search_expression\n",
202+
")\n",
203+
"\n",
204+
"table = trial_component_analytics.dataframe(force_refresh=True)\n",
205+
"display(table)"
115206
]
116207
},
117208
{
@@ -144,5 +235,5 @@
144235
}
145236
},
146237
"nbformat": 4,
147-
"nbformat_minor": 2
238+
"nbformat_minor": 4
148239
}

3_Distributed_training_with_Horovod.ipynb

Lines changed: 102 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,19 @@
3535
"### Configure callbacks\n",
3636
"Add the following callbacks:\n",
3737
"```python\n",
38-
" callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))\n",
39-
" callbacks.append(hvd.callbacks.MetricAverageCallback())\n",
40-
" callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))\n",
38+
"hvdBroadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0)\n",
39+
"hvdMetricAverage = hvd.callbacks.MetricAverageCallback()\n",
40+
"hvdLearningRate = hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)\n",
4141
"```\n",
4242
"\n",
43-
"change the checkpointd and tensorboard callback to run only on `hvd.rank() == o` (You want only a single process the send logs)\n",
43+
"Change the checkpoint and TensorBoard callbacks to run only on `hvd.rank() == 0` (you want only a single process to send logs)\n",
4444
"```python\n",
45-
" if hvd.rank() == 0:\n",
46-
" callbacks.append(ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.h5'))\n",
47-
" callbacks.append(TensorBoard(log_dir=args.model_dir,update_freq='epoch'))\n",
45+
"callbacks = [hvdBroadcast,hvdMetricAverage,hvdLearningRate]\n",
46+
"if hvd.rank() == 0:\n",
47+
" callbacks.append(checkpoint)\n",
48+
" callbacks.append(tb_callback)\n",
4849
"```\n",
50+
"Update `model.fit` (but not `model.evaluate`) to use the new callbacks list.\n",
4951
"\n",
5052
"### Configure the optimizer\n",
5153
"in\n",
@@ -101,6 +103,40 @@
101103
"role = get_execution_role()"
102104
]
103105
},
106+
{
107+
"cell_type": "markdown",
108+
"metadata": {},
109+
"source": [
110+
"### Load the SageMaker experiment"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"metadata": {},
117+
"outputs": [],
118+
"source": [
119+
"from smexperiments.experiment import Experiment\n",
120+
"from smexperiments.trial import Trial\n",
121+
"import time\n",
122+
"cifar10_experiment = Experiment.load(\n",
123+
" experiment_name=\"TensorFlow-cifar10-experiment\")"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": null,
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"# create a new trial\n",
133+
"trial_name = f\"cifar10-training-job-distributed-{int(time.time())}\"\n",
134+
"trial = Trial.create(\n",
135+
" trial_name=trial_name, \n",
136+
" experiment_name=cifar10_experiment.experiment_name\n",
137+
")"
138+
]
139+
},
104140
{
105141
"cell_type": "code",
106142
"execution_count": null,
@@ -112,13 +148,70 @@
112148
"estimator = ... "
113149
]
114150
},
151+
{
152+
"cell_type": "markdown",
153+
"metadata": {},
154+
"source": [
155+
"Connect the trial configured above to the job. Add the following experiment config to the `fit` function:\n",
156+
"```python\n",
157+
"experiment_config={\n",
158+
" \"ExperimentName\": cifar10_experiment.experiment_name, \n",
159+
" \"TrialName\": trial.trial_name,\n",
160+
" \"TrialComponentDisplayName\": \"Training\"}\n",
161+
"```"
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": null,
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
170+
"estimator.fit({'train' : 'train_data_location',\n",
171+
" 'validation' : 'validation_data_location',\n",
172+
" 'eval' : 'eval_data_location'},\n",
173+
" experiment_config=)"
174+
]
175+
},
176+
{
177+
"cell_type": "markdown",
178+
"metadata": {},
179+
"source": [
180+
"### Analyze the experiments"
181+
]
182+
},
115183
{
116184
"cell_type": "code",
117185
"execution_count": null,
118186
"metadata": {},
119187
"outputs": [],
120188
"source": [
121-
"estimator.fit()"
189+
"search_expression = {\n",
190+
" \"Filters\":[\n",
191+
" {\n",
192+
" \"Name\": \"DisplayName\",\n",
193+
" \"Operator\": \"Equals\",\n",
194+
" \"Value\": \"Training\",\n",
195+
" }\n",
196+
" ],\n",
197+
"}"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": null,
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
"from sagemaker.analytics import ExperimentAnalytics\n",
207+
"trial_component_analytics = ExperimentAnalytics(\n",
208+
" sagemaker_session=sagemaker_session, \n",
209+
" experiment_name=cifar10_experiment.experiment_name,\n",
210+
" search_expression=search_expression\n",
211+
")\n",
212+
"\n",
213+
"table = trial_component_analytics.dataframe(force_refresh=True)\n",
214+
"display(table)"
122215
]
123216
},
124217
{
@@ -154,5 +247,5 @@
154247
}
155248
},
156249
"nbformat": 4,
157-
"nbformat_minor": 2
250+
"nbformat_minor": 4
158251
}

0 commit comments

Comments
 (0)