|
35 | 35 | "### Configure callbacks\n", |
36 | 36 | "Add the following callbacks:\n", |
37 | 37 | "```python\n", |
38 | | - " callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))\n", |
39 | | - " callbacks.append(hvd.callbacks.MetricAverageCallback())\n", |
40 | | - " callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))\n", |
| 38 | + "hvdBroadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0)\n", |
| 39 | + "hvdMetricAverage = hvd.callbacks.MetricAverageCallback()\n", |
| 40 | + "hvdLearningRate = hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)\n", |
41 | 41 | "```\n", |
42 | 42 | "\n", |
43 | | - "change the checkpointd and tensorboard callback to run only on `hvd.rank() == o` (You want only a single process the send logs)\n", |
| 43 | + "Change the checkpoint and TensorBoard callbacks to run only when `hvd.rank() == 0` (you want only a single process to send logs):\n", |
44 | 44 | "```python\n", |
45 | | - " if hvd.rank() == 0:\n", |
46 | | - " callbacks.append(ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.h5'))\n", |
47 | | - " callbacks.append(TensorBoard(log_dir=args.model_dir,update_freq='epoch'))\n", |
| 45 | + "callbacks = [hvdBroadcast,hvdMetricAverage,hvdLearningRate]\n", |
| 46 | + "if hvd.rank() == 0:\n", |
| 47 | + " callbacks.append(checkpoint)\n", |
| 48 | + " callbacks.append(tb_callback)\n", |
48 | 49 | "```\n", |
| 50 | + "Update `model.fit` (but not `model.evaluate`) to use the new `callbacks` list.\n", |
49 | 51 | "\n", |
50 | 52 | "### Configure the optimizer\n", |
51 | 53 | "in\n", |
|
101 | 103 | "role = get_execution_role()" |
102 | 104 | ] |
103 | 105 | }, |
| 106 | + { |
| 107 | + "cell_type": "markdown", |
| 108 | + "metadata": {}, |
| 109 | + "source": [ |
| 110 | + "### Load the SageMaker experiment" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "cell_type": "code", |
| 115 | + "execution_count": null, |
| 116 | + "metadata": {}, |
| 117 | + "outputs": [], |
| 118 | + "source": [ |
| 119 | + "from smexperiments.experiment import Experiment\n", |
| 120 | + "from smexperiments.trial import Trial\n", |
| 121 | + "import time\n", |
| 122 | + "cifar10_experiment = Experiment.load(\n", |
| 123 | + " experiment_name=\"TensorFlow-cifar10-experiment\")" |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "code", |
| 128 | + "execution_count": null, |
| 129 | + "metadata": {}, |
| 130 | + "outputs": [], |
| 131 | + "source": [ |
| 132 | + "# create a new trial\n", |
| 133 | + "trial_name = f\"cifar10-training-job-distributed-{int(time.time())}\"\n", |
| 134 | + "trial = Trial.create(\n", |
| 135 | + " trial_name=trial_name, \n", |
| 136 | + " experiment_name=cifar10_experiment.experiment_name\n", |
| 137 | + ")" |
| 138 | + ] |
| 139 | + }, |
104 | 140 | { |
105 | 141 | "cell_type": "code", |
106 | 142 | "execution_count": null, |
|
112 | 148 | "estimator = ... " |
113 | 149 | ] |
114 | 150 | }, |
| 151 | + { |
| 152 | + "cell_type": "markdown", |
| 153 | + "metadata": {}, |
| 154 | + "source": [ |
| 155 | + "Connect the trial configured above to the job. Add the experiment config to the `fit` function:\n", |
| 156 | + "```python\n", |
| 157 | + "experiment_config={\n", |
| 158 | + " \"ExperimentName\": cifar10_experiment.experiment_name, \n", |
| 159 | + " \"TrialName\": trial.trial_name,\n", |
| 160 | + " \"TrialComponentDisplayName\": \"Training\"}\n", |
| 161 | + "```" |
| 162 | + ] |
| 163 | + }, |
| 164 | + { |
| 165 | + "cell_type": "code", |
| 166 | + "execution_count": null, |
| 167 | + "metadata": {}, |
| 168 | + "outputs": [], |
| 169 | + "source": [ |
| 170 | + "estimator.fit({'train' : 'train_data_location',\n", |
| 171 | + " 'validation' : 'validation_data_location',\n", |
| 172 | + " 'eval' : 'eval_data_location'},\n", |
| 173 | + " experiment_config=)" |
| 174 | + ] |
| 175 | + }, |
| 176 | + { |
| 177 | + "cell_type": "markdown", |
| 178 | + "metadata": {}, |
| 179 | + "source": [ |
| 180 | + "### Analyze the experiments" |
| 181 | + ] |
| 182 | + }, |
115 | 183 | { |
116 | 184 | "cell_type": "code", |
117 | 185 | "execution_count": null, |
118 | 186 | "metadata": {}, |
119 | 187 | "outputs": [], |
120 | 188 | "source": [ |
121 | | - "estimator.fit()" |
| 189 | + "search_expression = {\n", |
| 190 | + " \"Filters\":[\n", |
| 191 | + " {\n", |
| 192 | + " \"Name\": \"DisplayName\",\n", |
| 193 | + " \"Operator\": \"Equals\",\n", |
| 194 | + " \"Value\": \"Training\",\n", |
| 195 | + " }\n", |
| 196 | + " ],\n", |
| 197 | + "}" |
| 198 | + ] |
| 199 | + }, |
| 200 | + { |
| 201 | + "cell_type": "code", |
| 202 | + "execution_count": null, |
| 203 | + "metadata": {}, |
| 204 | + "outputs": [], |
| 205 | + "source": [ |
| 206 | + "from sagemaker.analytics import ExperimentAnalytics\n", |
| 207 | + "trial_component_analytics = ExperimentAnalytics(\n", |
| 208 | + " sagemaker_session=sagemaker_session, \n", |
| 209 | + " experiment_name=cifar10_experiment.experiment_name,\n", |
| 210 | + " search_expression=search_expression\n", |
| 211 | + ")\n", |
| 212 | + "\n", |
| 213 | + "table = trial_component_analytics.dataframe(force_refresh=True)\n", |
| 214 | + "display(table)" |
122 | 215 | ] |
123 | 216 | }, |
124 | 217 | { |
|
154 | 247 | } |
155 | 248 | }, |
156 | 249 | "nbformat": 4, |
157 | | - "nbformat_minor": 2 |
| 250 | + "nbformat_minor": 4 |
158 | 251 | } |
0 commit comments