Updated with Experiments management

fernbach · fernbach · commit f9611a185154 · 2019-12-30T18:03:50.000Z
diff --git a/2_Using_Pipemode_input_for_big_datasets.ipynb b/2_Using_Pipemode_input_for_big_datasets.ipynb
@@ -75,7 +75,7 @@
    "metadata": {},
    "source": [
     "Run the previous job, this time use the new script (cifar10_keras_pipe.py)\n",
-    "Run the job for 10 epochs and configure it with `input_mode='Pipe'`"
+    "Run the job for 20 epochs and configure it with `input_mode='Pipe'`"
    ]
   },
   {
@@ -93,6 +93,40 @@
     "role = get_execution_role()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the SageMaker experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from smexperiments.experiment import Experiment\n",
+    "from smexperiments.trial import Trial\n",
+    "import time\n",
+    "cifar10_experiment = Experiment.load(\n",
+    "    experiment_name=\"TensorFlow-cifar10-experiment\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new trial\n",
+    "trial_name = f\"cifar10-training-job-pipemode-{int(time.time())}\"\n",
+    "trial = Trial.create(\n",
+    "    trial_name=trial_name, \n",
+    "    experiment_name=cifar10_experiment.experiment_name\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -105,13 +139,70 @@
     "estimator = ... "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Connect the trial configured above to the job. Add the experiment config to the `fit` function:\n",
+    "```python\n",
+    "experiment_config={\n",
+    "                  \"ExperimentName\": cifar10_experiment.experiment_name, \n",
+    "                  \"TrialName\": trial.trial_name,\n",
+    "                  \"TrialComponentDisplayName\": \"Training\"}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "estimator.fit({'train' :  'train_data_location',\n",
+    "               'validation' :  'validation_data_location',\n",
+    "               'eval' :  'eval_data_location'},\n",
+    "             experiment_config=)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Analyze the experiments"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "estimator.fit()"
+    "search_expression = {\n",
+    "    \"Filters\":[\n",
+    "        {\n",
+    "            \"Name\": \"DisplayName\",\n",
+    "            \"Operator\": \"Equals\",\n",
+    "            \"Value\": \"Training\",\n",
+    "        }\n",
+    "    ],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.analytics import ExperimentAnalytics\n",
+    "trial_component_analytics = ExperimentAnalytics(\n",
+    "    sagemaker_session=sagemaker_session, \n",
+    "    experiment_name=cifar10_experiment.experiment_name,\n",
+    "    search_expression=search_expression\n",
+    ")\n",
+    "\n",
+    "table = trial_component_analytics.dataframe(force_refresh=True)\n",
+    "display(table)"
    ]
   },
   {
@@ -144,5 +235,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/3_Distributed_training_with_Horovod.ipynb b/3_Distributed_training_with_Horovod.ipynb
@@ -35,17 +35,19 @@
     "### Configure callbacks\n",
     "add the following callbacks:\n",
     "```python\n",
-    "    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))\n",
-    "    callbacks.append(hvd.callbacks.MetricAverageCallback())\n",
-    "    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))\n",
+    "hvdBroadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0)\n",
+    "hvdMetricAverage = hvd.callbacks.MetricAverageCallback()\n",
+    "hvdLearningRate = hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1)\n",
     "```\n",
     "\n",
-    "change the checkpointd and tensorboard callback to run only on `hvd.rank() == o` (You want only a single process the send logs)\n",
+    "Change the checkpoint and TensorBoard callbacks to run only when `hvd.rank() == 0` (you want only a single process to send logs):\n",
     "```python\n",
-    "    if hvd.rank() == 0:\n",
-    "        callbacks.append(ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.h5'))\n",
-    "        callbacks.append(TensorBoard(log_dir=args.model_dir,update_freq='epoch'))\n",
+    "callbacks = [hvdBroadcast,hvdMetricAverage,hvdLearningRate]\n",
+    "if hvd.rank() == 0:\n",
+    "    callbacks.append(checkpoint)\n",
+    "    callbacks.append(tb_callback)\n",
     "```\n",
+    "Update `model.fit` (but not `model.evaluate`) to use the new callbacks list.\n",
     "\n",
     "### Configure the optimizer\n",
     "in\n",
@@ -101,6 +103,40 @@
     "role = get_execution_role()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the SageMaker experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from smexperiments.experiment import Experiment\n",
+    "from smexperiments.trial import Trial\n",
+    "import time\n",
+    "cifar10_experiment = Experiment.load(\n",
+    "    experiment_name=\"TensorFlow-cifar10-experiment\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new trial\n",
+    "trial_name = f\"cifar10-training-job-distributed-{int(time.time())}\"\n",
+    "trial = Trial.create(\n",
+    "    trial_name=trial_name, \n",
+    "    experiment_name=cifar10_experiment.experiment_name\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -112,13 +148,70 @@
     "estimator = ... "
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Connect the trial configured above to the job. Add the experiment config to the `fit` function:\n",
+    "```python\n",
+    "experiment_config={\n",
+    "                  \"ExperimentName\": cifar10_experiment.experiment_name, \n",
+    "                  \"TrialName\": trial.trial_name,\n",
+    "                  \"TrialComponentDisplayName\": \"Training\"}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "estimator.fit({'train' :  'train_data_location',\n",
+    "               'validation' :  'validation_data_location',\n",
+    "               'eval' :  'eval_data_location'},\n",
+    "             experiment_config=)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Analyze the experiments"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "estimator.fit()"
+    "search_expression = {\n",
+    "    \"Filters\":[\n",
+    "        {\n",
+    "            \"Name\": \"DisplayName\",\n",
+    "            \"Operator\": \"Equals\",\n",
+    "            \"Value\": \"Training\",\n",
+    "        }\n",
+    "    ],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.analytics import ExperimentAnalytics\n",
+    "trial_component_analytics = ExperimentAnalytics(\n",
+    "    sagemaker_session=sagemaker_session, \n",
+    "    experiment_name=cifar10_experiment.experiment_name,\n",
+    "    search_expression=search_expression\n",
+    ")\n",
+    "\n",
+    "table = trial_component_analytics.dataframe(force_refresh=True)\n",
+    "display(table)"
    ]
   },
   {
@@ -154,5 +247,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }