add createHostTransferableTensorWithOffset session option to create host transferable tensors with tile mapping balance

alexshuang · alexshuang · commit 0e887be6f0c5 · 2023-07-05T08:36:07.000Z
Summary: !ci_branch_mk2 T74481 encounters OOM on tile0 when a large number of small input tensors are all mapped to tile0. This diff accumulates the bytes of the tensors created so far and passes that total as the offset to createHostTransferableTensor(), rotating the start index of the next mapping so that the input tensors are mapped in a balanced way. Reviewers: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Reviewed By: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Subscribers: leiw, matthewha, markk, hanzhao, alleng Maniphest Tasks: T74481 Differential Revision: https://phabricator.sourcevertex.net/D85103
diff --git a/python/popart/popart_core/popart.cpp b/python/popart/popart_core/popart.cpp
@@ -1356,6 +1356,9 @@ PYBIND11_MODULE(popart_core, m) {
     cls.def_readwrite(
         "_customTransformApplierSettings",
         &SessionOptions::ExperimentalSettings::customTransformApplierSettings);
+    cls.def_readwrite("createHostTransferableTensorWithOffset",
+                      &SessionOptions::ExperimentalSettings::
+                          createHostTransferableTensorWithOffset);
   }
   {
     py::class_<SessionOptions> cls(m, "SessionOptions");
diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
@@ -47,6 +47,8 @@ add_unit_test(preplan_convolutions_test preplan_convolutions_test.cpp VARIANTS "
 add_unit_test(preplan_matmuls_test preplan_matmuls_test.cpp VARIANTS "IpuModel2")
 add_unit_test(rng_mapping_test rng_mapping_test.cpp)
 
+add_unit_test(inittensoroffsetmap_test inittensoroffsetmap_test.cpp VARIANTS Hw)
+
 # add_popart_py_unit_test(test_util) Utility
 # add_popart_py_unit_test(test_session) Utility
 
diff --git a/tests/integration/inittensoroffsetmap_test.cpp b/tests/integration/inittensoroffsetmap_test.cpp
@@ -0,0 +1,134 @@
+// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#define BOOST_TEST_MODULE init_tensor_offset_map
+
+#include <algorithm>
+#include <any>
+#include <boost/test/unit_test.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include <poplar/Graph.hpp>
+#include <poplar/Interval.hpp>
+
+#include "popart/builder.gen.hpp"
+#include "popart/dataflow.hpp"
+#include "popart/debugcontext.hpp"
+#include "popart/inputshapeinfo.hpp"
+#include "popart/logging.hpp"
+#include "popart/names.hpp"
+#include "popart/patterns/patterns.hpp"
+#include "popart/sessionoptions.hpp"
+#include "popart/tensorinfo.hpp"
+#include "popart/util.hpp"
+#include "popart/voiddata.hpp"
+
+// This trick is required to access the Devicex's poplar::Tensors.
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wkeyword-macro"
+#endif
+#define protected public
+#define private public
+
+#include <testdevice.hpp>
+#include <popart/builder.hpp>
+#include <popart/devicemanager.hpp>
+#include <popart/error.hpp>
+#include <popart/popx/devicex.hpp>
+#include <popart/popx/irlowering.hpp>
+#include <popart/session.hpp>
+#include <popart/sgd.hpp>
+
+#include "popart/popx/poptensors.hpp"
+
+#undef private
+#undef protected
+
+// Checks that, with createHostTransferableTensorWithOffset enabled, every
+// host-load tensor starts its tile mapping on a different tile.
+BOOST_AUTO_TEST_CASE(InitTensorOffsetMap) {
+  // In this test, each input tensor is exactly the size of one packet's bytes
+  // for one tile. Therefore, when createHostTransferableTensorWithOffset =
+  // true, the accumulated tensor bytes are passed to
+  // createHostTransferableTensor() as the offset, and the tensors are mapped
+  // across tiles rather than all being mapped to tile0.
+
+  using namespace popart;
+
+  auto builder = Builder::create();
+  auto aiOnnx  = builder->aiOnnxOpset9();
+
+  // one packet per tile = 1024 bytes = 256 * FLOAT
+  std::vector<int64_t> inputShape{1, 256};
+  TensorInfo inputInfo("FLOAT", inputShape);
+
+  // Three identically sized inputs, all placed on the IO tiles.
+  auto a = builder->addInputTensor(
+      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
+  auto b = builder->addInputTensor(
+      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
+  auto c = builder->addInputTensor(
+      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
+  auto x = aiOnnx.add({a, b});
+  x      = aiOnnx.add({x, c});
+  builder->addOutputTensor(x);
+
+  auto proto    = builder->getModelProto();
+  auto dataFlow = DataFlow(5, {{x, AnchorReturnType("All")}});
+
+  SessionOptions opts;
+  opts.virtualGraphMode        = VirtualGraphMode::Auto;
+  opts.enableExplicitMainLoops = true;
+  opts.useHostCopyOps          = true;
+  opts.numIOTiles              = 32;
+  opts.experimentalSettings.createHostTransferableTensorWithOffset = true;
+
+  auto device = createTestDevice(TEST_TARGET, 1);
+
+  auto session = popart::InferenceSession::createFromOnnxModel(
+      proto,
+      dataFlow,
+      device,
+      InputShapeInfo(),
+      opts,
+      popart::Patterns(PatternsLevel::Default));
+
+  session->prepareDevice();
+
+  using Mapping = poplar::Graph::TileToTensorMapping;
+
+  // Index of the first tile that holds any interval of the tensor
+  // (0 if no tile holds one).
+  auto getStartTile = [](const Mapping &mapping) -> unsigned {
+    for (unsigned tile = 0; tile < mapping.size(); ++tile) {
+      if (!mapping[tile].empty()) {
+        return tile;
+      }
+    }
+    return 0;
+  };
+
+  // Start tile of every host-load tensor, keyed by tensor id.
+  std::map<std::string, unsigned> startMappings;
+  auto &irLowering = session->getDevice().lowering();
+  const auto &ir   = irLowering.ir();
+  for (auto &id : ir.getAllTensorIds()) {
+    auto *t = ir.getTensor(id);
+    if (!t->isHostLoadTensor()) {
+      continue;
+    }
+    auto vgid    = t->getVirtualGraphIdAndTileSetUnsafe();
+    auto &graph  = irLowering.getVirtualGraph(vgid.first, vgid.second);
+    auto &tensor = irLowering.tensors().get(t->id);
+    // Query the tile mapping once and reuse it for the start tile and the log.
+    const auto tm        = graph.getTileMapping(tensor);
+    startMappings[t->id] = getStartTile(tm);
+    std::cout << t->id << " : " << tm << '\n';
+  }
+
+  // The uniqueness checks below pass vacuously when fewer than two host-load
+  // tensors are found, so make sure there is something to compare.
+  BOOST_CHECK(startMappings.size() > 1);
+
+  std::set<unsigned> uniqueMappings;
+  for (const auto &mappings : startMappings) {
+    BOOST_CHECK(uniqueMappings.insert(mappings.second).second == true);
+  }
+  BOOST_CHECK(uniqueMappings.size() == startMappings.size());
+}
diff --git a/tests/integration/options_test.py b/tests/integration/options_test.py
@@ -164,3 +164,9 @@ def test_updatableNamedBuffers_option():
     assert len(opts.updatableNamedBuffers) == 2
     assert opts.updatableNamedBuffers[0] == "t1"
     assert opts.updatableNamedBuffers[1] == "t2"
+
+
+def test_createHostTransferableTensorWithOffset():
+    opts = popart.SessionOptions()
+    opts.experimentalSettings.createHostTransferableTensorWithOffset = True
+    assert opts.experimentalSettings.createHostTransferableTensorWithOffset is True
diff --git a/willow/include/popart/docs/pydocs_popart_core.hpp b/willow/include/popart/docs/pydocs_popart_core.hpp
@@ -14919,6 +14919,14 @@ static const char
 static const char *__doc_popart_SessionOptions_stashAllTensorsInferencePipeline =
     R"doc(Specify whether to enable stash all needed tensors when inference pipeline. Default :code:`false` (disabled).)doc";
 
+static const char *
+    __doc_popart_SessionOptions_ExperimentalSettings_createHostTransferableTensorWithOffset =
+        R"doc(Accumulate the created tensors bytes, rotate the start tile of the next tensor to balance the tile mapping. Especially when there are a lot of small input tensors, enable it can avoid mapping on tile0 all the time.)doc";
+
+static const char *
+    __singlelinedoc_popart_SessionOptions_ExperimentalSettings_createHostTransferableTensorWithOffset =
+        R"doc(Accumulate the created tensors bytes, rotate the start tile of the next tensor to balance the tile mapping.)doc";
+
 static const char *__doc_popart_SessionOptions_updatableNamedBuffers =
     R"doc(List of model named buffers that can be updated with call to
 buffersFromHost(). This allows to update just a subset of model weights
diff --git a/willow/include/popart/popx/inittensoroffsetmap.hpp b/willow/include/popart/popx/inittensoroffsetmap.hpp
@@ -0,0 +1,29 @@
+// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#ifndef POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_
+#define POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_
+
+#include <cstddef>
+#include <map>
+
+namespace poplar {
+class Graph;
+class Tensor;
+} // namespace poplar
+
+namespace popart {
+namespace popx {
+
+// Keeps one byte offset per poplar::Graph. Per the member comment in the
+// original, the offset is the number of bytes of tensors created so far.
+class InitTensorOffsetMap {
+public:
+  // Returns the offset stored for `graph`. A graph that has no entry yet is
+  // registered with an offset of 0, and 0 is returned.
+  std::size_t getOffset(poplar::Graph &graph);
+
+  // Stores `offset` for `graph`, replacing any value stored previously.
+  void setOffset(poplar::Graph &graph, std::size_t offset);
+
+private:
+  // Offset (the created tensor bytes), keyed by the address of the graph.
+  std::map<poplar::Graph *, std::size_t> offsets;
+};
+
+} // namespace popx
+} // namespace popart
+
+#endif // POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_
diff --git a/willow/include/popart/popx/irlowering.hpp b/willow/include/popart/popx/irlowering.hpp
@@ -25,6 +25,7 @@
 #include <popart/names.hpp>
 #include <popart/popx/exchangebundle.hpp>
 #include <popart/popx/inittensor.hpp>
+#include <popart/popx/inittensoroffsetmap.hpp>
 #include <popart/popx/linearmapper.hpp>
 #include <popart/popx/namesx.hpp>
 #include <popart/popx/popprograms.hpp>
@@ -229,6 +230,11 @@ class IrLowering {
   // Map tensors evenly across all tiles
   LinearMapper linearMapper;
 
+  // Stores the bytes of the tensors created so far. This is used to rotate
+  // the start tile of the next mapping, so that init tensors are created
+  // evenly across all tiles.
+  InitTensorOffsetMap initTensorOffsetMap;
+
   poplar::Tensor randomSeedTensor;
 
   // TODO T11630: Combine the inputStreams/outputStreams with the
@@ -632,6 +638,8 @@ class IrLowering {
 
   LinearMapper &getLinearMapper() { return linearMapper; }
 
+  // Returns a mutable reference to this object's initTensorOffsetMap member,
+  // so callers can both read and update the stored offsets.
+  InitTensorOffsetMap &getInitTensorOffsetMap() { return initTensorOffsetMap; }
+
   const liveness::LivenessAnalyzer *getLivenessAnalyzer() const {
     return livenessAnalyzer.get();
   }
diff --git a/willow/include/popart/sessionoptions.hpp b/willow/include/popart/sessionoptions.hpp
@@ -1388,6 +1388,16 @@ struct SessionOptions {
      */
     std::map<std::string, std::vector<std::string>>
         customTransformApplierSettings;
+
+    /**
+     * Accumulate the bytes of the created tensors and rotate the start
+     * tile of the next tensor to balance the tile mapping.
+     * This is especially useful when there are a lot of small input
+     * tensors: enabling it avoids mapping them to tile0 all the time.
+     *
+     * Default=`false`.
+     */
+    bool createHostTransferableTensorWithOffset = false;
   };
 
   /// Configuration setting for custom transform applier.
diff --git a/willow/src/popx/inittensoroffsetmap.cpp b/willow/src/popx/inittensoroffsetmap.cpp
@@ -0,0 +1,30 @@
+// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <vector>
+#include <poplar/Graph.hpp>
+#include <poplar/Target.hpp>
+#include <popart/popx/inittensoroffsetmap.hpp>
+
+namespace popart {
+namespace popx {
+
+// Returns the offset recorded for `graph`. A graph seen for the first time is
+// registered with an offset of 0, which is then returned.
+std::size_t InitTensorOffsetMap::getOffset(poplar::Graph &graph) {
+  // emplace() leaves an existing entry untouched and inserts {&graph, 0}
+  // otherwise; either way the returned iterator points at the entry to read.
+  const auto entry = offsets.emplace(&graph, std::size_t{0}).first;
+  return entry->second;
+}
+
+// Records `offset` for `graph`, overwriting any value stored previously.
+void InitTensorOffsetMap::setOffset(poplar::Graph &graph,
+                                    const std::size_t offset) {
+  offsets.insert_or_assign(&graph, offset);
+}
+
+} // namespace popx
+} // namespace popart
diff --git a/willow/src/popx/op/exchange/exchangex.cpp b/willow/src/popx/op/exchange/exchangex.cpp
@@ -193,13 +193,37 @@ poplar::Tensor ExchangeDescriptorx::create(poplar::Graph &graph,
           ? descriptor.getHostStreamTensorId()
           : std::to_string(descriptor.getRemoteBufferId());
 
+  poplar::Tensor t;
+  auto &lowering = dv_p->lowering();
+  auto withOffset =
+      lowering.ir()
+          .getSessionOptions()
+          .experimentalSettings.createHostTransferableTensorWithOffset;
+
   // Note: ExchangeDirection::Store means isRead is true for the host side
-  return popops::createHostTransferableTensor(
-      graph,
-      popType(info.getDataTypeInfo()->type()),
-      info.shape_szt(),
-      descriptor.getDirection() == ExchangeDirection::Store,
-      {debugContext});
+  if (!withOffset) {
+    t = popops::createHostTransferableTensor(
+        graph,
+        popType(info.getDataTypeInfo()->type()),
+        info.shape_szt(),
+        descriptor.getDirection() == ExchangeDirection::Store,
+        {debugContext});
+  } else {
+    auto &offsetMap = lowering.getInitTensorOffsetMap();
+    auto offset     = offsetMap.getOffset(graph);
+    t               = popops::createHostTransferableTensor(
+        graph,
+        popType(info.getDataTypeInfo()->type()),
+        info.shape_szt(),
+        descriptor.getDirection() == ExchangeDirection::Store,
+        offset,
+        {debugContext});
+    auto dtype = popType(info.getDataTypeInfo()->type());
+    offset += graph.getTarget().getTypeSize(dtype) * t.numElements();
+    offsetMap.setOffset(graph, offset);
+  }
+
+  return t;
 }
 
 std::unique_ptr<ExchangeDescriptorx>

Original file line number	Diff line number	Diff line change
`@@ -1356,6 +1356,9 @@ PYBIND11_MODULE(popart_core, m) {`
`1356`	`1356`	`cls.def_readwrite(`
`1357`	`1357`	`"_customTransformApplierSettings",`
`1358`	`1358`	`&SessionOptions::ExperimentalSettings::customTransformApplierSettings);`
	`1359`	`+ cls.def_readwrite("createHostTransferableTensorWithOffset",`
	`1360`	`+ &SessionOptions::ExperimentalSettings::`
	`1361`	`+ createHostTransferableTensorWithOffset);`
`1359`	`1362`	`}`
`1360`	`1363`	`{`
`1361`	`1364`	`py::class_<SessionOptions> cls(m, "SessionOptions");`