Skip to content

Commit 4ccb2f2

Browse files
improving AliasModel to reduce compilation time
Summary: !ci_branch_mk2 When growing an AliasModel for a very large model, or for a wide model with many ops or tensors, using hash maps that are pre-sized to the predicted number of entries reduces the compilation time spent growing the AliasModel. With this change, for the custom model, the time spent in AliasModel during compilation is reduced by 75% (400 s -> 90 s). Reviewers: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Reviewed By: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Subscribers: markk, shirazb, grahamh Maniphest Tasks: T74364, T74421, T74362 Differential Revision: https://phabricator.sourcevertex.net/D84294
1 parent 16de130 commit 4ccb2f2

File tree

3 files changed

+133
-6
lines changed

3 files changed

+133
-6
lines changed

willow/include/popart/alias/aliasmodel.hpp

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
#ifndef POPART_WILLOW_INCLUDE_POPART_ALIAS_ALIASMODEL_HPP_
33
#define POPART_WILLOW_INCLUDE_POPART_ALIAS_ALIASMODEL_HPP_
44

5-
#include <map>
5+
#include <string>
6+
#include <unordered_map>
67
#include <vector>
78
#include <poprithms/common/multiout/opid.hpp>
89
#include <poprithms/common/multiout/tensorid.hpp>
910
#include <poprithms/memory/inplace/graph.hpp>
1011

12+
#include "popart/error.hpp"
1113
#include "popart/names.hpp"
1214

1315
namespace popart {
@@ -26,6 +28,20 @@ class AliasModel {
2628
using PoprithmsTensorId = poprithms::memory::inplace::TensorId;
2729
using PoprithmsOpId = poprithms::memory::inplace::OpId;
2830

31+
AliasModel();
32+
33+
~AliasModel() = default;
34+
35+
/**
36+
* load factor used for hash map containers
37+
*/
38+
// Declared as a floating-point constant: with type `int` the initializer 0.5
// is truncated to 0, which makes the load-factor threshold meaningless.
static constexpr float loadFactor = 0.5f;
39+
40+
/**
41+
* Set PopART graph
42+
*/
43+
void setGraph(const popart::Graph *graph);
44+
2945
/**
3046
* Register that a poprithms Tensor and a popart Tensor correspond to each
3147
* other. In addition to registering the Tensor correspondence, the Ops which
@@ -193,11 +209,66 @@ class AliasModel {
193209
* */
194210
poprithms::memory::inplace::Graph g;
195211

212+
/**
213+
* Non-owning pointer to the PopART graph; null until setGraph() is called
214+
*/
215+
popart::Graph *thisGraph = nullptr;
216+
196217
private:
197-
std::map<TensorId, PoprithmsTensorId> toTensor_;
198-
std::map<PoprithmsTensorId, TensorId> fromTensor_;
199-
std::map<OpId, std::vector<poprithms::memory::inplace::OpId>> toOp_;
200-
std::map<poprithms::memory::inplace::OpId, OpId> fromOp_;
218+
// using a simpler hasher which is faster than std::hash derivatives and that
219+
// is safe enough in this case.
220+
// Hash functor for PoprithmsTensorId keys. The hash is the numeric value of
// the tensor's op id (id.opId().get()) converted to size_t.
// NOTE(review): only the op id contributes to the hash, so any two tensor ids
// that share an op id hash to the same value - presumably acceptable for the
// expected key distribution; confirm.
// Throws popart `error` if the op id value is negative.
struct PoprithmsTensorIdSteadyHasher {
221+
size_t operator()(const PoprithmsTensorId &id) const {
222+
// Reject negative values before converting the poprithms typed integer
// (presumably int64_t - confirm) to size_t, so the cast cannot wrap around.
223+
if (id.opId().get() < 0) {
224+
throw error("Unexpected negative value!");
225+
}
226+
return static_cast<size_t>(id.opId().get());
227+
}
228+
};
229+
230+
// Hash functor for PoprithmsOpId keys. The hash is the numeric value of the
// op id (opid.get()) converted to size_t.
// Throws popart `error` if the op id value is negative.
struct PoprithmsOpIdHasher {
231+
size_t operator()(const PoprithmsOpId &opid) const {
232+
// Reject negative values before converting the poprithms typed integer
// (presumably int64_t - confirm) to size_t, so the cast cannot wrap around.
233+
if (opid.get() < 0) {
234+
throw error("Unexpected negative value!");
235+
}
236+
return static_cast<size_t>(opid.get());
237+
}
238+
};
239+
240+
// When dealing with medium-sized or even larger models,
241+
// the number of tensors and ops grows fast. By using std::unordered_map
242+
// we can reduce the insertion and query complexity from O(log N)
243+
// (N is the number of tensors or ops to deal with) to O(1). The
244+
// problem of std::unordered_map is the capacity. The initial capacity of
245+
// the container is 8, and once it is at capacity, the container will
246+
// request almost double the memory from the kernel (very slow), and copy the
247+
// old data into the new memory (slow again). Frequently requesting memory in
248+
// a loop is time consuming. By predicting the memory to be used, we can achieve
249+
// exact O(1) complexity for faster compilation.
250+
template <size_t GROWTH_FACTOR, typename Key, typename Val, class Hasher>
251+
void
252+
preAllocAndRehash(typename std::unordered_map<Key, Val, Hasher> &hashTable,
253+
size_t max_slots) {
254+
if (hashTable.bucket_count() < max_slots) {
255+
hashTable.reserve((hashTable.size() + max_slots) * GROWTH_FACTOR);
256+
}
257+
{
258+
float load_factor = hashTable.size() / hashTable.bucket_count();
259+
if (load_factor > AliasModel::loadFactor) {
260+
hashTable.rehash((hashTable.size() + max_slots) * GROWTH_FACTOR);
261+
}
262+
}
263+
}
264+
265+
// Warning: do not iterate over these members, as iterating over unordered
266+
// containers introduces non-determinism.
267+
std::unordered_map<TensorId, PoprithmsTensorId> toTensor_;
268+
std::unordered_map<PoprithmsTensorId, TensorId, PoprithmsTensorIdSteadyHasher>
269+
fromTensor_;
270+
std::unordered_map<OpId, std::vector<PoprithmsOpId>> toOp_;
271+
std::unordered_map<PoprithmsOpId, OpId, PoprithmsOpIdHasher> fromOp_;
201272
};
202273

203274
} // namespace popart

willow/src/alias/aliasmodel.cpp

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// Copyright (c) 2021 Graphcore Ltd. All rights reserved.
22
#include <algorithm>
3-
#include <map>
43
#include <ostream>
54
#include <string>
5+
#include <unordered_map>
66
#include <utility>
77
#include <vector>
88
#include <poprithms/common/multiout/ioindices.hpp>
@@ -12,6 +12,7 @@
1212
#include <poprithms/ndarray/shape.hpp>
1313
#include <poprithms/util/typedinteger.hpp>
1414
#include <popart/alias/aliasmodel.hpp>
15+
#include <popart/graph.hpp>
1516
#include <popart/ir.hpp>
1617

1718
#include "popart/error.hpp"
@@ -22,12 +23,55 @@
2223
#include "popart/tensordebuginfo.hpp"
2324
#include "popart/tensorinfo.hpp"
2425

26+
// Note: the constants below live in an unnamed namespace, so they have internal linkage.
27+
namespace {
28+
/**
29+
* Number of entries reserved up front in the tensor maps (toTensor_ and
* fromTensor_) by the AliasModel constructor.
30+
*/
31+
constexpr int maxInitTensors = 1500;
32+
33+
/**
34+
* Number of entries reserved up front in the op maps (toOp_ and fromOp_) by
* the AliasModel constructor.
35+
*/
36+
constexpr int maxInitOps = 1000;
37+
38+
/**
39+
* Growth factor passed as GROWTH_FACTOR to preAllocAndRehash for the tensor
* hash maps.
40+
*/
41+
constexpr int tGrowthFactor = 2;
42+
43+
/**
44+
* Growth factor passed as GROWTH_FACTOR to preAllocAndRehash for the op hash
* maps.
45+
*/
46+
constexpr int opGrowthFactor = 2;
47+
48+
} // namespace
49+
2550
namespace popart {
2651

2752
using PoprithmsTensorId = poprithms::memory::inplace::TensorId;
2853
using PoprithmsOpId = poprithms::memory::inplace::OpId;
2954

55+
// Constructs an AliasModel with its four lookup tables pre-sized: the tensor
// maps reserve room for maxInitTensors entries and the op maps for maxInitOps
// entries, so that early insertions do not trigger a resize.
AliasModel::AliasModel() {
56+
toTensor_.reserve(maxInitTensors);
57+
fromTensor_.reserve(maxInitTensors);
58+
toOp_.reserve(maxInitOps);
59+
fromOp_.reserve(maxInitOps);
60+
}
61+
62+
// Records the PopART graph this model is grown for. Only the pointer is
// stored; no copy of the graph is made.
// NOTE(review): insertTensor and insertOp dereference thisGraph, so this
// presumably has to be called before either of them - confirm against callers.
void AliasModel::setGraph(const popart::Graph *graph) {
63+
// thisGraph is declared as a pointer to non-const Graph, so constness is cast
// away here.
thisGraph = const_cast<popart::Graph *>(graph);
64+
}
65+
3066
void AliasModel::insertTensor(const PoprithmsTensorId &id, const Tensor &t) {
67+
// as long as we predict memory used to avoid frequent malloc, hash map is
68+
// always faster than std::map in both insertion and query. This should
69+
// reduce the compilation time.
70+
auto &allTensors = thisGraph->getTensors();
71+
72+
preAllocAndRehash<tGrowthFactor>(toTensor_, allTensors.n());
73+
preAllocAndRehash<tGrowthFactor>(fromTensor_, allTensors.n());
74+
3175
toTensor_[t.id] = id;
3276
fromTensor_[id] = t.id;
3377
if (t.hasProducer()) {
@@ -40,6 +84,7 @@ void AliasModel::update(OpId oldId, OpId newId) {
4084
if (found != toOp_.cend()) {
4185
auto oldTargets = found->second;
4286
toOp_.erase(found);
87+
// found becomes invalidated immediately
4388
toOp_[newId] = oldTargets;
4489
for (auto t : oldTargets) {
4590
fromOp_[t] = newId;
@@ -56,6 +101,11 @@ void AliasModel::insertOp(PoprithmsOpId poprithmsId, OpId id) {
56101
}
57102
}
58103

104+
auto &allOps = thisGraph->getOps();
105+
106+
preAllocAndRehash<opGrowthFactor>(fromOp_, allOps.size());
107+
preAllocAndRehash<opGrowthFactor>(toOp_, allOps.size());
108+
59109
fromOp_[poprithmsId] = id;
60110
auto found = toOp_.find(id);
61111
if (found == toOp_.cend()) {

willow/src/alias/aliasmodelgrower.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ void AliasModelGrower::growFullGraph(const Graph &graph,
4949
graph.getIr().timePartitionLogger().scopedStopwatch(logging::format(
5050
"Growing full AliasModel for {}", graph.getGraphString()));
5151

52+
AliasModel &aliasModel = getAliasModelRef();
53+
aliasModel.setGraph(&graph);
54+
5255
// NOTE: This loop does not need a schedule that complies with topocons. It
5356
// may be possible to make this more efficient by getting an Op-order that is
5457
// only constrained by data order.
@@ -80,6 +83,9 @@ void AliasModelGrower::growPartialGraph(const Graph &graph,
8083
auto scopedStopwatch = graph.getIr().timePartitionLogger().scopedStopwatch(
8184
"Growing partial AliasModel");
8285

86+
AliasModel &aliasModel = getAliasModelRef();
87+
aliasModel.setGraph(&graph);
88+
8389
// When growing a AliasModel, by the nature of the Poprithms' API we
8490
// must grow ops in schedule order. However, we don't know which tensors
8591
// that precede `tensorId` in the schedule order may be aliasing `tensorId`

0 commit comments

Comments
 (0)