Skip to content

Commit f00b735

Browse files
Add a more efficient way to add constraints in OverlapIO
Summary: !ci_branch_mk2 Improve overlap IO: OverlapIO currently creates N*N*L*L new topocons, where N is the number of bins in each part (8 before the loop, 7 inside the loop, 6 after the loop) and L, for a very wide model, can be a large number of inputs for some layer. Suppose we have 1000 inputs; then there are 3x1000 InitOp, 3x1000 HostLoadOp, 3x1000 IOComputeCopy and 3x1000 ComputeIOCopy. In each of the 3 stages (before the loop, inside the loop, after the loop), to make sure all InitOps are scheduled before the HostLoads, we only need to make sure that: 1. the last InitOp is scheduled before the first HostLoad (in topological order); 2. the last HostOp is scheduled before the first IoCompute; 3. the last ComputeIo is scheduled before the first HostStore. The complexity of the constraints is then reduced from O(N*N*L*L) to O(N*L) for a very wide model with a large number of inputs. Proposal: https://graphcore.atlassian.net/wiki/spaces/CHINA/pages/3279061011/Optimize+OverlapIO+in+very+widespread+model Test Plan: The code has been tested with this overlap IO script: python: ``` # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
import numpy as np import popart BPS = 16 NUM_INPUTS = 1000 INPUT_SHAPE = [512, 1] if __name__ == '__main__': builder = popart.Builder() # Create many input tensors in_tensors = [] for i in range(NUM_INPUTS): inp = builder.addInputTensor( popart.TensorInfo("FLOAT16", INPUT_SHAPE), popart.InputSettings( popart.TileSet.IO, popart.ExchangeStrategy.OverlapInnerLoop ), debugContext="input" ) in_tensors.append(inp) # Computation part is quite simple , just concat all the input tensors and reduce cc = builder.aiOnnx.concat(in_tensors, 1) o = builder.aiOnnx.reducesum([cc]) builder.addOutputTensor(o) proto = builder.getModelProto() anchors = {o: popart.AnchorReturnType("ALL", popart.TileSet.IO, popart.ExchangeStrategy.OverlapInnerLoop) } # anchors = {o : popart.AnchorReturnType("ALL")} dataFlow = popart.DataFlow(BPS, anchors) device = popart.DeviceManager().acquireAvailableDevice(1) opts = popart.SessionOptions() opts.numIOTiles = 128 opts.virtualGraphMode = popart.VirtualGraphMode.Auto opts.enableExplicitMainLoops = True opts.useHostCopyOps = True opts.defaultBufferingDepth = 2 opts.rearrangeAnchorsOnHost = False opts.rearrangeStreamsOnHost = False session = popart.InferenceSession(fnModel=proto, dataFlow=dataFlow, userOptions=opts, deviceInfo=device) session.prepareDevice() anchors = session.initAnchorArrays() inputs = {} for i in range(NUM_INPUTS): inputs[in_tensors[i]] = np.random.randn(BPS, *INPUT_SHAPE).astype(np.float16) stepio = popart.PyStepIO(inputs, anchors) for i in range(4): session.run(stepio) ``` Snapshot of compilation acceleration: inputs 50: {F2693165} inputs 200 12 mins : {F2694251} {F2694278} inputs 500 3.5 hrs = (subgraphoutline 3 hrs + aliasmodel growing 0.5 hrs) (tile0 OOM, this should disappear once alex's diff applied): {F2694273} {F2694279} Reviewers: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, hanzhao, matthewha Reviewed By: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, 
hanzhao, matthewha Subscribers: hanzhao, matthewha, alleng Maniphest Tasks: T74421 Differential Revision: https://phabricator.sourcevertex.net/D85429
1 parent 0e887be commit f00b735

File tree

7 files changed

+510
-3
lines changed

7 files changed

+510
-3
lines changed

python/popart/popart_core/popart.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1668,6 +1668,10 @@ PYBIND11_MODULE(popart_core, m) {
16681668
cls.def_readwrite("useHostCopyOps",
16691669
&SessionOptions::useHostCopyOps,
16701670
DOC(popart, SessionOptions, useHostCopyOps));
1671+
cls.def_readwrite(
1672+
"enableEfficientOverlapIOTopoCons",
1673+
&SessionOptions::enableEfficientOverlapIOTopoCons,
1674+
DOC(popart, SessionOptions, enableEfficientOverlapIOTopoCons));
16711675
cls.def_readwrite(
16721676
"enableSupportedDataTypeCasting",
16731677
&SessionOptions::enableSupportedDataTypeCasting,

tests/unittests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ add_unit_test(unittest_stepio_deonnxing_regression_tests stepio/deonnxing_regres
9898

9999
add_unit_test(unittest_autodiff transforms/unittest_autodiff.cpp SUPPORT_LIBS ir-query-test-util test-graphs-test-util)
100100
add_unit_test(unittest_overlapio transforms/unittest_overlapio.cpp SUPPORT_LIBS ir-query-test-util test-graphs-test-util)
101+
add_unit_test(unittest_efficient_overlapio transforms/unittest_efficient_overlapio.cpp SUPPORT_LIBS ir-query-test-util test-graphs-test-util)
101102
add_unit_test(unittest_mainloops transforms/unittest_mainloops.cpp VARIANTS Cpu)
102103
add_unit_test(unittest_backwardsgraphcreator transforms/autodiff/unittest_backwardsgraphcreator.cpp)
103104
add_unit_test(unittest_calledgraphgradophelper transforms/autodiff/unittest_calledgraphgradophelper.cpp)
Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
2+
#define BOOST_TEST_MODULE overlapio_unittest
3+
4+
#include <boost/test/unit_test.hpp>
5+
#include <cstddef>
6+
#include <functional>
7+
#include <map>
8+
#include <testutil/test_graphs/graph_test_models.hpp>
9+
#include <vector>
10+
#include <popart/graph.hpp>
11+
#include <popart/ir.hpp>
12+
#include <popart/op.hpp>
13+
#include <popart/op/add.hpp>
14+
#include <popart/op/exchange/exchange.hpp>
15+
#include <popart/op/exchange/hostcopy.hpp>
16+
#include <popart/op/init.hpp>
17+
#include <popart/op/iotilecopy.hpp>
18+
#include <popart/op/loop.hpp>
19+
#include <popart/transforms/overlapio.hpp>
20+
21+
#include "popart/graphid.hpp"
22+
#include "popart/logging.hpp"
23+
#include "popart/names.hpp"
24+
#include "popart/scheduler_requireoptimal.hpp"
25+
#include "popart/transforms/mainloops.hpp"
26+
#include "popart/vendored/optional.hpp"
27+
#include "testutil/irquery/irquery.hpp"
28+
29+
using namespace popart;
30+
using namespace popart::irquery;
31+
32+
namespace {
33+
template <typename T>
34+
bool checkSchedule(const std::vector<Op *> &schedule, size_t &j) {
35+
return schedule.at(j++)->isConvertibleTo<T>();
36+
}
37+
} // namespace
38+
39+
// Test overlap IO graph when all inputs and outputs are overlapped for the
40+
// inner loop
41+
BOOST_AUTO_TEST_CASE(OverlapInnerLoop) {
42+
GraphTestModel3 model(ExchangeStrategy::OverlapInnerLoop,
43+
ExchangeStrategy::OverlapInnerLoop,
44+
ExchangeStrategy::OverlapInnerLoop);
45+
46+
auto &ir = model.getIr();
47+
auto &opts = ir.getSessionOptions();
48+
opts.enableEfficientOverlapIOTopoCons = true;
49+
ir.applyTransform(OverlapIO::id(), ir.getMainGraph());
50+
ir.updateVertices();
51+
ir.setIsPrepared();
52+
53+
ir.dotCheckpoint(ir, "Final");
54+
55+
IrTestWrapper tw_ir{ir};
56+
auto tw_mainGraph = tw_ir.hasGraph(ir.getMainGraph().id, Require::MustBeTrue);
57+
auto tw_outerLoopOp = tw_mainGraph->ops().hasOp<LoopOp>(
58+
[&](auto &tw_op) -> bool {
59+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
60+
GraphId(MainLoops::getStepGraphName());
61+
},
62+
Require::MustBeTrue);
63+
64+
BOOST_REQUIRE_EQUAL(tw_mainGraph->unwrap().get().getOps().size(), 1);
65+
66+
auto outerLoopOp = tw_outerLoopOp->unwrap();
67+
GraphId stepGraphId = outerLoopOp->getCalledGraph().id;
68+
auto tw_stepGraph = tw_ir.hasGraph(stepGraphId, Require::MustBeTrue);
69+
70+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getInputIds().size(), 2);
71+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getOutputIds().size(), 1);
72+
73+
auto tw_innerLoopOp = tw_stepGraph->ops().hasOp<LoopOp>(
74+
[&](auto &tw_op) -> bool {
75+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
76+
GraphId(MainLoops::getAccumulationGraphName());
77+
},
78+
Require::MustBeTrue);
79+
80+
auto innerLoopOp = tw_innerLoopOp->unwrap();
81+
GraphId accumGraphId = innerLoopOp->getCalledGraph().id;
82+
auto tw_accumGraph = tw_ir.hasGraph(accumGraphId, Require::MustBeTrue);
83+
84+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getInputIds().size(), 5);
85+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getOutputIds().size(), 4);
86+
87+
auto stepGraphSchedule = tw_stepGraph->unwrap().get().getOpSchedule(
88+
{}, RequireOptimalSchedule::Yes);
89+
for (size_t i = 0; i < stepGraphSchedule.size(); ++i) {
90+
logging::trace(
91+
"Step graph: {}: {}", i, stepGraphSchedule.at(i)->debugName());
92+
}
93+
94+
{
95+
size_t j = 0;
96+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
97+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
98+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
99+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
100+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
101+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
102+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
103+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
104+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
105+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
106+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
107+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
108+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
109+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
110+
BOOST_REQUIRE(checkSchedule<LoopOp>(stepGraphSchedule, j));
111+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
112+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
113+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
114+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
115+
}
116+
117+
auto accumGraphSchedule = tw_accumGraph->unwrap().get().getOpSchedule(
118+
{}, RequireOptimalSchedule::Yes);
119+
for (size_t i = 0; i < accumGraphSchedule.size(); ++i) {
120+
logging::trace(
121+
"Accum graph: {}: {}", i, accumGraphSchedule.at(i)->debugName());
122+
}
123+
124+
{
125+
size_t j = 0;
126+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
127+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
128+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(accumGraphSchedule, j));
129+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
130+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
131+
BOOST_REQUIRE(checkSchedule<AddOp>(accumGraphSchedule, j));
132+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
133+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
134+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
135+
}
136+
}
137+
138+
// Test overlap IO graph when only input A is overlapped for the inner loop
139+
// (see input A in GraphTestModel3)
140+
BOOST_AUTO_TEST_CASE(OverlapInnerLoopA) {
141+
GraphTestModel3 model(ExchangeStrategy::OverlapInnerLoop,
142+
ExchangeStrategy::JustInTime,
143+
ExchangeStrategy::JustInTime);
144+
145+
auto &ir = model.getIr();
146+
auto &opts = ir.getSessionOptions();
147+
opts.enableEfficientOverlapIOTopoCons = true;
148+
ir.applyTransform(OverlapIO::id(), ir.getMainGraph());
149+
ir.updateVertices();
150+
ir.setIsPrepared();
151+
152+
ir.dotCheckpoint(ir, "Final");
153+
154+
IrTestWrapper tw_ir{ir};
155+
auto tw_mainGraph = tw_ir.hasGraph(ir.getMainGraph().id, Require::MustBeTrue);
156+
auto tw_outerLoopOp = tw_mainGraph->ops().hasOp<LoopOp>(
157+
[&](auto &tw_op) -> bool {
158+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
159+
GraphId(MainLoops::getStepGraphName());
160+
},
161+
Require::MustBeTrue);
162+
163+
BOOST_REQUIRE_EQUAL(tw_mainGraph->unwrap().get().getOps().size(), 1);
164+
165+
auto outerLoopOp = tw_outerLoopOp->unwrap();
166+
GraphId stepGraphId = outerLoopOp->getCalledGraph().id;
167+
auto tw_stepGraph = tw_ir.hasGraph(stepGraphId, Require::MustBeTrue);
168+
169+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getInputIds().size(), 2);
170+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getOutputIds().size(), 1);
171+
172+
auto tw_innerLoopOp = tw_stepGraph->ops().hasOp<LoopOp>(
173+
[&](auto &tw_op) -> bool {
174+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
175+
GraphId(MainLoops::getAccumulationGraphName());
176+
},
177+
Require::MustBeTrue);
178+
179+
auto innerLoopOp = tw_innerLoopOp->unwrap();
180+
GraphId accumGraphId = innerLoopOp->getCalledGraph().id;
181+
auto tw_accumGraph = tw_ir.hasGraph(accumGraphId, Require::MustBeTrue);
182+
183+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getInputIds().size(), 3);
184+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getOutputIds().size(), 2);
185+
186+
auto stepGraphSchedule = tw_stepGraph->unwrap().get().getOpSchedule(
187+
{}, RequireOptimalSchedule::Yes);
188+
for (size_t i = 0; i < stepGraphSchedule.size(); ++i) {
189+
logging::trace(
190+
"Step graph: {}: {}", i, stepGraphSchedule.at(i)->debugName());
191+
}
192+
193+
{
194+
size_t j = 0;
195+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
196+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
197+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
198+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
199+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
200+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
201+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
202+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
203+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
204+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
205+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
206+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
207+
BOOST_REQUIRE(checkSchedule<LoopOp>(stepGraphSchedule, j));
208+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
209+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
210+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
211+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
212+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
213+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
214+
}
215+
216+
auto accumGraphSchedule = tw_accumGraph->unwrap().get().getOpSchedule(
217+
{}, RequireOptimalSchedule::Yes);
218+
for (size_t i = 0; i < accumGraphSchedule.size(); ++i) {
219+
logging::trace(
220+
"Accum graph: {}: {}", i, accumGraphSchedule.at(i)->debugName());
221+
}
222+
223+
{
224+
size_t j = 0;
225+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
226+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
227+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
228+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
229+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
230+
BOOST_REQUIRE(checkSchedule<AddOp>(accumGraphSchedule, j));
231+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
232+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(accumGraphSchedule, j));
233+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
234+
}
235+
}
236+
237+
// Test overlap IO graph when only output C is overlapped for the inner loop
238+
// (see output C in GraphTestModel3)
239+
BOOST_AUTO_TEST_CASE(OverlapInnerLoopC) {
240+
GraphTestModel3 model(ExchangeStrategy::JustInTime,
241+
ExchangeStrategy::JustInTime,
242+
ExchangeStrategy::OverlapInnerLoop);
243+
244+
auto &ir = model.getIr();
245+
auto &opts = ir.getSessionOptions();
246+
opts.enableEfficientOverlapIOTopoCons = true;
247+
ir.applyTransform(OverlapIO::id(), ir.getMainGraph());
248+
ir.updateVertices();
249+
ir.setIsPrepared();
250+
251+
ir.dotCheckpoint(ir, "Final");
252+
253+
IrTestWrapper tw_ir{ir};
254+
auto tw_mainGraph = tw_ir.hasGraph(ir.getMainGraph().id, Require::MustBeTrue);
255+
auto tw_outerLoopOp = tw_mainGraph->ops().hasOp<LoopOp>(
256+
[&](auto &tw_op) -> bool {
257+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
258+
GraphId(MainLoops::getStepGraphName());
259+
},
260+
Require::MustBeTrue);
261+
262+
BOOST_REQUIRE_EQUAL(tw_mainGraph->unwrap().get().getOps().size(), 1);
263+
264+
auto outerLoopOp = tw_outerLoopOp->unwrap();
265+
GraphId stepGraphId = outerLoopOp->getCalledGraph().id;
266+
auto tw_stepGraph = tw_ir.hasGraph(stepGraphId, Require::MustBeTrue);
267+
268+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getInputIds().size(), 2);
269+
BOOST_REQUIRE_EQUAL(tw_stepGraph->unwrap().get().getOutputIds().size(), 1);
270+
271+
auto tw_innerLoopOp = tw_stepGraph->ops().hasOp<LoopOp>(
272+
[&](auto &tw_op) -> bool {
273+
return tw_op.unwrap()->getCalledGraphs().at(0)->id ==
274+
GraphId(MainLoops::getAccumulationGraphName());
275+
},
276+
Require::MustBeTrue);
277+
278+
auto innerLoopOp = tw_innerLoopOp->unwrap();
279+
GraphId accumGraphId = innerLoopOp->getCalledGraph().id;
280+
auto tw_accumGraph = tw_ir.hasGraph(accumGraphId, Require::MustBeTrue);
281+
282+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getInputIds().size(), 3);
283+
BOOST_REQUIRE_EQUAL(tw_accumGraph->unwrap().get().getOutputIds().size(), 2);
284+
285+
auto stepGraphSchedule = tw_stepGraph->unwrap().get().getOpSchedule(
286+
{}, RequireOptimalSchedule::Yes);
287+
for (size_t i = 0; i < stepGraphSchedule.size(); ++i) {
288+
logging::trace(
289+
"Step graph: {}: {}", i, stepGraphSchedule.at(i)->debugName());
290+
}
291+
292+
{
293+
size_t j = 0;
294+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
295+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
296+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
297+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
298+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
299+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
300+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
301+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
302+
BOOST_REQUIRE(checkSchedule<LoopOp>(stepGraphSchedule, j));
303+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
304+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
305+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
306+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
307+
BOOST_REQUIRE(checkSchedule<InitOp>(stepGraphSchedule, j));
308+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(stepGraphSchedule, j));
309+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
310+
BOOST_REQUIRE(checkSchedule<AddOp>(stepGraphSchedule, j));
311+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(stepGraphSchedule, j));
312+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(stepGraphSchedule, j));
313+
}
314+
315+
auto accumGraphSchedule = tw_accumGraph->unwrap().get().getOpSchedule(
316+
{}, RequireOptimalSchedule::Yes);
317+
for (size_t i = 0; i < accumGraphSchedule.size(); ++i) {
318+
logging::trace(
319+
"Accum graph: {}: {}", i, accumGraphSchedule.at(i)->debugName());
320+
}
321+
322+
{
323+
size_t j = 0;
324+
BOOST_REQUIRE(checkSchedule<HostStoreOp>(accumGraphSchedule, j));
325+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
326+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
327+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
328+
BOOST_REQUIRE(checkSchedule<InitOp>(accumGraphSchedule, j));
329+
BOOST_REQUIRE(checkSchedule<HostLoadOp>(accumGraphSchedule, j));
330+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
331+
BOOST_REQUIRE(checkSchedule<AddOp>(accumGraphSchedule, j));
332+
BOOST_REQUIRE(checkSchedule<IoTileCopyOp>(accumGraphSchedule, j));
333+
}
334+
}

willow/include/popart/docs/pydocs_popart_core.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14968,6 +14968,23 @@ Default: :code:`false` (not enabled).)doc";
1496814968
static const char *__singlelinedoc_popart_SessionOptions_useHostCopyOps =
1496914969
R"doc(Enable use of IR graph operations for data and anchor streams. Default: :code:`false` (not enabled).)doc";
1497014970

14971+
static const char
14972+
*__doc_popart_SessionOptions_enableEfficientOverlapIOTopoCons =
14973+
R"doc(Enable efficient overlap io topo constraints.
14974+
14975+
Suppose we have the N bins in each of three stage(8 for before loop /7 for insdie loop /6 for after loop),
14976+
and L ops for each bins, vallina implementaiton of overlapio creates topocons of complexity O(N*N*L*L).
14977+
14978+
To make sure InitOps in each step are scheduled before HostLoadOps, we only need to keep topo constrains in each bin
14979+
and let the last of op of each bin Bin0 is scheduled before the first op of Bin1 next to Bin0. Then total
14980+
complexity O(N*N*L*L) is reduced to (N*L).
14981+
14982+
Default: :code:`false` (not enabled).)doc";
14983+
14984+
static const char
14985+
*__singlelinedoc_popart_SessionOptions_enableEfficientOverlapIOTopoCons =
14986+
R"doc(Enable efficient overlap io topo constrains. Suppose we have the N bins in each of three stage(8 for before loop /7 for insdie loop /6 for after loop), and L ops for each bins, vallina implementaiton of overlapio creates topocons of complexity O(N*N*L*L). To make sure InitOps in each step are scheduled before HostLoadOps, we only need to keep topo constrains in each bin and let the last of op of each bin Bin0 is scheduled before the first op of Bin1 next to Bin0. Then total complexity O(N*N*L*L) is reduced to (N*L). Default: :code:`false` (not enabled).)doc";
14987+
1497114988
static const char *__doc_popart_SessionOptions_virtualGraphMode =
1497214989
R"doc(Specify how to place ops on virtual graphs to achieve model
1497314990
parallelism, either manually using model annotations, or automatically.

0 commit comments

Comments
 (0)