Skip to content

Commit 0e887be

Browse files
committed
Add createHostTransferableTensorWithOffset session option to create host-transferable tensors with a balanced tile mapping
Summary: !ci_branch_mk2 T74481 encountered an OOM on tile0 when a large number of small input tensors were all mapped to tile0. This diff accumulates the bytes of the tensors created so far and passes that total as the offset argument of createHostTransferableTensor(), which rotates the start tile of the next mapping so that input tensors are mapped evenly across tiles. Reviewers: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Reviewed By: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Subscribers: leiw, matthewha, markk, hanzhao, alleng Maniphest Tasks: T74481 Differential Revision: https://phabricator.sourcevertex.net/D85103
1 parent 2e1017f commit 0e887be

File tree

10 files changed

+260
-6
lines changed

10 files changed

+260
-6
lines changed

python/popart/popart_core/popart.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,9 @@ PYBIND11_MODULE(popart_core, m) {
13561356
cls.def_readwrite(
13571357
"_customTransformApplierSettings",
13581358
&SessionOptions::ExperimentalSettings::customTransformApplierSettings);
1359+
cls.def_readwrite("createHostTransferableTensorWithOffset",
1360+
&SessionOptions::ExperimentalSettings::
1361+
createHostTransferableTensorWithOffset);
13591362
}
13601363
{
13611364
py::class_<SessionOptions> cls(m, "SessionOptions");

tests/integration/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ add_unit_test(preplan_convolutions_test preplan_convolutions_test.cpp VARIANTS "
4747
add_unit_test(preplan_matmuls_test preplan_matmuls_test.cpp VARIANTS "IpuModel2")
4848
add_unit_test(rng_mapping_test rng_mapping_test.cpp)
4949

50+
add_unit_test(inittensoroffsetmap_test inittensoroffsetmap_test.cpp VARIANTS Hw)
51+
5052
# add_popart_py_unit_test(test_util) Utility
5153
# add_popart_py_unit_test(test_session) Utility
5254

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
2+
#define BOOST_TEST_MODULE init_tensor_offset_map
3+
4+
#include <algorithm>
#include <any>
#include <boost/test/unit_test.hpp>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
15+
#include <poplar/Graph.hpp>
16+
#include <poplar/Interval.hpp>
17+
18+
#include "popart/builder.gen.hpp"
19+
#include "popart/dataflow.hpp"
20+
#include "popart/debugcontext.hpp"
21+
#include "popart/inputshapeinfo.hpp"
22+
#include "popart/logging.hpp"
23+
#include "popart/names.hpp"
24+
#include "popart/patterns/patterns.hpp"
25+
#include "popart/sessionoptions.hpp"
26+
#include "popart/tensorinfo.hpp"
27+
#include "popart/util.hpp"
28+
#include "popart/voiddata.hpp"
29+
30+
// This trick is required to access the Devicex's poplar::Tensors.
31+
32+
#ifdef __clang__
33+
#pragma clang diagnostic ignored "-Wkeyword-macro"
34+
#endif
35+
#define protected public
36+
#define private public
37+
38+
#include <testdevice.hpp>
39+
#include <popart/builder.hpp>
40+
#include <popart/devicemanager.hpp>
41+
#include <popart/error.hpp>
42+
#include <popart/popx/devicex.hpp>
43+
#include <popart/popx/irlowering.hpp>
44+
#include <popart/session.hpp>
45+
#include <popart/sgd.hpp>
46+
47+
#include "popart/popx/poptensors.hpp"
48+
49+
#undef private
50+
#undef protected
51+
52+
// Checks that, with createHostTransferableTensorWithOffset enabled, host-load
// tensors start on distinct tiles instead of all starting on the same tile.
//
// Each input tensor is exactly one tile's packet in size. The accumulated
// tensor bytes are passed to createHostTransferableTensor() as the offset, so
// consecutive tensors are expected to be mapped across tiles rather than all
// being mapped to tile0.
BOOST_AUTO_TEST_CASE(InitTensorOffsetMap) {
  using namespace popart;

  auto builder = Builder::create();
  auto aiOnnx  = builder->aiOnnxOpset9();

  // one packet per tile = 1024 bytes = 256 * FLOAT
  std::vector<int64_t> inputShape{1, 256};
  TensorInfo inputInfo("FLOAT", inputShape);

  auto a = builder->addInputTensor(
      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
  auto b = builder->addInputTensor(
      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
  auto c = builder->addInputTensor(
      inputInfo, {TileSet::IO, ExchangeStrategy::OverlapInnerLoop});
  auto x = aiOnnx.add({a, b});
  x      = aiOnnx.add({x, c});
  builder->addOutputTensor(x);

  auto proto    = builder->getModelProto();
  auto dataFlow = DataFlow(5, {{x, AnchorReturnType("All")}});

  SessionOptions opts;
  opts.virtualGraphMode        = VirtualGraphMode::Auto;
  opts.enableExplicitMainLoops = true;
  opts.useHostCopyOps          = true;
  opts.numIOTiles              = 32;
  opts.experimentalSettings.createHostTransferableTensorWithOffset = true;

  auto device = createTestDevice(TEST_TARGET, 1);

  auto session = popart::InferenceSession::createFromOnnxModel(
      proto,
      dataFlow,
      device,
      InputShapeInfo(),
      opts,
      popart::Patterns(PatternsLevel::Default));

  session->prepareDevice();

  // Tensor id -> index of the first tile holding any part of that tensor.
  std::map<std::string, unsigned> startMappings;
  auto &irLowering = session->getDevice().lowering();
  const auto &ir   = irLowering.ir();
  for (const auto &id : ir.getAllTensorIds()) {
    auto *t = ir.getTensor(id);
    if (!t->isHostLoadTensor()) {
      continue;
    }

    auto vgid      = t->getVirtualGraphIdAndTileSetUnsafe();
    auto &graph    = irLowering.getVirtualGraph(vgid.first, vgid.second);
    auto &tensor   = irLowering.tensors().get(t->id);
    const auto &tm = graph.getTileMapping(tensor);

    // Find the first tile with a non-empty interval list. A tensor that is
    // mapped nowhere must fail the test; silently reporting it as "tile 0"
    // would be indistinguishable from a genuine tile-0 start.
    const auto firstMapped =
        std::find_if(tm.begin(), tm.end(), [](const auto &intervals) {
          return !intervals.empty();
        });
    BOOST_REQUIRE_MESSAGE(firstMapped != tm.end(),
                          "host-load tensor " << t->id
                                              << " is not mapped to any tile");

    startMappings[t->id] = static_cast<unsigned>(firstMapped - tm.begin());
    std::cout << t->id << " : " << tm << '\n';
  }

  // Without at least one host-load tensor the uniqueness checks below would
  // pass vacuously.
  BOOST_REQUIRE(!startMappings.empty());

  std::set<unsigned> uniqueMappings;
  for (const auto &entry : startMappings) {
    BOOST_CHECK_MESSAGE(uniqueMappings.insert(entry.second).second,
                        "start tile " << entry.second << " of tensor "
                                      << entry.first
                                      << " is shared with another tensor");
  }
  BOOST_CHECK_EQUAL(uniqueMappings.size(), startMappings.size());
}

tests/integration/options_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,9 @@ def test_updatableNamedBuffers_option():
164164
assert len(opts.updatableNamedBuffers) == 2
165165
assert opts.updatableNamedBuffers[0] == "t1"
166166
assert opts.updatableNamedBuffers[1] == "t2"
167+
168+
169+
def test_createHostTransferableTensorWithOffset():
    """The experimental option defaults to False and can be switched on."""
    opts = popart.SessionOptions()
    # The C++ member is declared with a default of `false`; check it before
    # writing so the test covers more than the attribute being writable.
    assert opts.experimentalSettings.createHostTransferableTensorWithOffset is False
    opts.experimentalSettings.createHostTransferableTensorWithOffset = True
    assert opts.experimentalSettings.createHostTransferableTensorWithOffset is True

willow/include/popart/docs/pydocs_popart_core.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14919,6 +14919,14 @@ static const char
1491914919
static const char *__doc_popart_SessionOptions_stashAllTensorsInferencePipeline =
1492014920
R"doc(Specify whether to enable stash all needed tensors when inference pipeline. Default :code:`false` (disabled).)doc";
1492114921

14922+
// Docstring text for
// SessionOptions::ExperimentalSettings::createHostTransferableTensorWithOffset.
static const char *
    __doc_popart_SessionOptions_ExperimentalSettings_createHostTransferableTensorWithOffset =
        R"doc(Accumulate the created tensors bytes, rotate the start tile of the next tensor to balance the tile mapping. Especially when there are a lot of small input tensors, enable it can avoid mapping on tile0 all the time.)doc";
14925+
14926+
// Single-line docstring text for
// SessionOptions::ExperimentalSettings::createHostTransferableTensorWithOffset.
static const char *
    __singlelinedoc_popart_SessionOptions_ExperimentalSettings_createHostTransferableTensorWithOffset =
        R"doc(Accumulate the created tensors bytes, rotate the start tile of the next tensor to balance the tile mapping.)doc";
14929+
1492214930
static const char *__doc_popart_SessionOptions_updatableNamedBuffers =
1492314931
R"doc(List of model named buffers that can be updated with call to
1492414932
buffersFromHost(). This allows to update just a subset of model weights
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
2+
#ifndef POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_
3+
#define POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_
4+
5+
#include <cstddef>
6+
#include <map>
7+
8+
namespace poplar {
9+
class Graph;
10+
class Tensor;
11+
} // namespace poplar
12+
13+
namespace popart {
14+
namespace popx {
15+
16+
class InitTensorOffsetMap {
17+
public:
18+
std::size_t getOffset(poplar::Graph &graph);
19+
void setOffset(poplar::Graph &graph, const std::size_t offset);
20+
21+
private:
22+
// offset, the created tensor bytes
23+
std::map<poplar::Graph *, std::size_t> offsets;
24+
};
25+
26+
} // namespace popx
27+
} // namespace popart
28+
29+
#endif // POPART_WILLOW_INCLUDE_POPART_POPX_INITTENSOROFFSETMAP_HPP_

willow/include/popart/popx/irlowering.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <popart/names.hpp>
2626
#include <popart/popx/exchangebundle.hpp>
2727
#include <popart/popx/inittensor.hpp>
28+
#include <popart/popx/inittensoroffsetmap.hpp>
2829
#include <popart/popx/linearmapper.hpp>
2930
#include <popart/popx/namesx.hpp>
3031
#include <popart/popx/popprograms.hpp>
@@ -229,6 +230,11 @@ class IrLowering {
229230
// Map tensors evenly across all tiles
230231
LinearMapper linearMapper;
231232

233+
// Save the created tensor bytes which used to help rotate
234+
// the next mapping start tile, create init tensors evenly
235+
// across all tiles
236+
InitTensorOffsetMap initTensorOffsetMap;
237+
232238
poplar::Tensor randomSeedTensor;
233239

234240
// TODO T11630: Combine the inputStreams/outputStreams with the
@@ -632,6 +638,8 @@ class IrLowering {
632638

633639
// Returns a mutable reference to the linearMapper member.
LinearMapper &getLinearMapper() { return linearMapper; }
634640

641+
// Returns a mutable reference to the initTensorOffsetMap member, so callers
// can both read and update the stored offsets.
InitTensorOffsetMap &getInitTensorOffsetMap() { return initTensorOffsetMap; }
642+
635643
// Returns the pointer obtained from livenessAnalyzer.get(); the pointee is
// exposed as const.
// NOTE(review): livenessAnalyzer is presumably a smart pointer, in which case
// the result may be null and is not owned by the caller — confirm.
const liveness::LivenessAnalyzer *getLivenessAnalyzer() const {
636644
return livenessAnalyzer.get();
637645
}

willow/include/popart/sessionoptions.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,6 +1388,16 @@ struct SessionOptions {
13881388
*/
13891389
std::map<std::string, std::vector<std::string>>
13901390
customTransformApplierSettings;
1391+
1392+
/**
1393+
* Accumulate the created tensors bytes, rotate the start
1394+
* tile of the next tensor to balance the tile mapping.
1395+
* Especially when there are a lot of small input tensors, enable it
1396+
* can avoid mapping on tile0 all the time.
1397+
*
1398+
* Default=`false`.
1399+
*/
1400+
bool createHostTransferableTensorWithOffset = false;
13911401
};
13921402

13931403
/// Configuration setting for custom transform applier.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright (c) 2023 Graphcore Ltd. All rights reserved.
2+
#include <algorithm>
3+
#include <cstddef>
4+
#include <map>
5+
#include <vector>
6+
#include <poplar/Graph.hpp>
7+
#include <poplar/Target.hpp>
8+
#include <popart/popx/inittensoroffsetmap.hpp>
9+
10+
namespace popart {
11+
namespace popx {
12+
13+
// Returns the offset recorded for `graph`. The first query for a graph
// registers it with an offset of 0 and returns 0.
std::size_t InitTensorOffsetMap::getOffset(poplar::Graph &graph) {
  // operator[] value-initialises a missing entry to 0 and otherwise yields
  // the stored value, so one lookup covers both the hit and the miss case.
  return offsets[&graph];
}
23+
24+
// Records `offset` for `graph`, overwriting any value stored earlier and
// creating the entry if the graph has not been seen yet.
void InitTensorOffsetMap::setOffset(poplar::Graph &graph,
                                    const std::size_t offset) {
  offsets.insert_or_assign(&graph, offset);
}
28+
29+
} // namespace popx
30+
} // namespace popart

willow/src/popx/op/exchange/exchangex.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,37 @@ poplar::Tensor ExchangeDescriptorx::create(poplar::Graph &graph,
193193
? descriptor.getHostStreamTensorId()
194194
: std::to_string(descriptor.getRemoteBufferId());
195195

196+
poplar::Tensor t;
197+
auto &lowering = dv_p->lowering();
198+
auto withOffset =
199+
lowering.ir()
200+
.getSessionOptions()
201+
.experimentalSettings.createHostTransferableTensorWithOffset;
202+
196203
// Note: ExchangeDirection::Store means isRead is true for the host side
197-
return popops::createHostTransferableTensor(
198-
graph,
199-
popType(info.getDataTypeInfo()->type()),
200-
info.shape_szt(),
201-
descriptor.getDirection() == ExchangeDirection::Store,
202-
{debugContext});
204+
if (!withOffset) {
205+
t = popops::createHostTransferableTensor(
206+
graph,
207+
popType(info.getDataTypeInfo()->type()),
208+
info.shape_szt(),
209+
descriptor.getDirection() == ExchangeDirection::Store,
210+
{debugContext});
211+
} else {
212+
auto &offsetMap = lowering.getInitTensorOffsetMap();
213+
auto offset = offsetMap.getOffset(graph);
214+
t = popops::createHostTransferableTensor(
215+
graph,
216+
popType(info.getDataTypeInfo()->type()),
217+
info.shape_szt(),
218+
descriptor.getDirection() == ExchangeDirection::Store,
219+
offset,
220+
{debugContext});
221+
auto dtype = popType(info.getDataTypeInfo()->type());
222+
offset += graph.getTarget().getTypeSize(dtype) * t.numElements();
223+
offsetMap.setOffset(graph, offset);
224+
}
225+
226+
return t;
203227
}
204228

205229
std::unique_ptr<ExchangeDescriptorx>

0 commit comments

Comments
 (0)