improving AliasModel to reduce compilation time

yiakwy-xpu-ml-framework-team · yiakwy-xpu-ml-framework-team · commit 4ccb2f2eedbb · 2023-07-21T19:20:08.000+08:00
Summary: !ci_branch_mk2 When growing an AliasModel for a very large model, or for a wide model with many ops or tensors, using hash maps with a predicted size can reduce the time spent growing the AliasModel. With this change, growing the AliasModel for the custom model takes about 75% less compilation time (400 -> 90 s). Reviewers: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Reviewed By: #popart, #framework_ip_review_-_any_oss_or_third-party_code_use_has_been_approved, markk Subscribers: markk, shirazb, grahamh Maniphest Tasks: T74364, T74421, T74362 Differential Revision: https://phabricator.sourcevertex.net/D84294
diff --git a/willow/include/popart/alias/aliasmodel.hpp b/willow/include/popart/alias/aliasmodel.hpp
@@ -2,12 +2,14 @@
 #ifndef POPART_WILLOW_INCLUDE_POPART_ALIAS_ALIASMODEL_HPP_
 #define POPART_WILLOW_INCLUDE_POPART_ALIAS_ALIASMODEL_HPP_
 
-#include <map>
+#include <string>
+#include <unordered_map>
 #include <vector>
 #include <poprithms/common/multiout/opid.hpp>
 #include <poprithms/common/multiout/tensorid.hpp>
 #include <poprithms/memory/inplace/graph.hpp>
 
+#include "popart/error.hpp"
 #include "popart/names.hpp"
 
 namespace popart {
@@ -26,6 +28,20 @@ class AliasModel {
   using PoprithmsTensorId = poprithms::memory::inplace::TensorId;
   using PoprithmsOpId     = poprithms::memory::inplace::OpId;
 
+  /**
+   * Construct an empty AliasModel. The definition in aliasmodel.cpp reserves
+   * initial capacity in the Tensor and Op lookup tables.
+   */
+  AliasModel();
+
+  ~AliasModel() = default;
+
+  /**
+   * Load factor (elements per bucket) above which preAllocAndRehash grows a
+   * hash map container. This has to be a floating point type: an int
+   * initialised from 0.5 is silently truncated to 0.
+   */
+  static constexpr float loadFactor = 0.5f;
+
+  /**
+   * Set the PopART graph that this model is grown from. The pointer is stored
+   * in thisGraph; insertTensor and insertOp query the graph's tensors and ops
+   * through it when sizing the lookup tables.
+   *
+   * \param graph The PopART graph. It is stored without taking ownership
+   *     (NOTE(review): presumably it must outlive this AliasModel - confirm).
+   */
+  void setGraph(const popart::Graph *graph);
+
   /**
    * Register that a poprithms Tensor and a popart Tensor correspond to each
    * other. In addition to registering the Tensor correspondence, the Ops which
@@ -193,11 +209,66 @@ class AliasModel {
    * */
   poprithms::memory::inplace::Graph g;
 
+  /**
+   * The PopART graph being modelled, or nullptr until setGraph is called.
+   * insertTensor and insertOp dereference this pointer without a null check,
+   * so setGraph must be called before either of them.
+   */
+  popart::Graph *thisGraph = nullptr;
+
 private:
-  std::map<TensorId, PoprithmsTensorId> toTensor_;
-  std::map<PoprithmsTensorId, TensorId> fromTensor_;
-  std::map<OpId, std::vector<poprithms::memory::inplace::OpId>> toOp_;
-  std::map<poprithms::memory::inplace::OpId, OpId> fromOp_;
+  // Hash functor for PoprithmsTensorId keys. The hash is the raw integer value
+  // of the tensor's opId(), used directly with no mixing step; this is
+  // intended to be cheaper than a std::hash based functor. Only opId()
+  // contributes to the hash.
+  //
+  // Throws a popart error if the op id is negative.
+  struct PoprithmsTensorIdSteadyHasher {
+    size_t operator()(const PoprithmsTensorId &id) const {
+      const auto rawOpId = id.opId().get();
+      // The id is a signed integer; a negative value would wrap around when
+      // converted to size_t, so it is rejected instead.
+      if (rawOpId < 0) {
+        throw error("Unexpected negative value!");
+      }
+      return static_cast<size_t>(rawOpId);
+    }
+  };
+
+  // Hash functor for PoprithmsOpId keys: the hash is the id's raw integer
+  // value. Throws a popart error if that value is negative.
+  struct PoprithmsOpIdHasher {
+    size_t operator()(const PoprithmsOpId &opid) const {
+      const auto rawId = opid.get();
+      // Non-negative ids convert to size_t without changing value.
+      if (rawId >= 0) {
+        return static_cast<size_t>(rawId);
+      }
+      throw error("Unexpected negative value!");
+    }
+  };
+
+  // Grow `hashTable` ahead of the insertions that are about to happen.
+  //
+  // For medium and large models the number of tensors and ops grows quickly.
+  // std::unordered_map gives O(1) average insertion and lookup where std::map
+  // gives O(log N), but every time the table outgrows its bucket array it
+  // allocates a larger one and rehashes all existing elements. Doing that
+  // repeatedly inside the loop that grows an AliasModel is time consuming, so
+  // the bucket array is sized up front from the expected number of entries.
+  //
+  // GROWTH_FACTOR: multiplier applied to (current size + max_slots) when
+  //                choosing the requested capacity.
+  // hashTable:     the container to grow.
+  // max_slots:     the expected number of entries, e.g. the number of tensors
+  //                or ops in the graph.
+  template <size_t GROWTH_FACTOR, typename Key, typename Val, class Hasher>
+  void
+  preAllocAndRehash(typename std::unordered_map<Key, Val, Hasher> &hashTable,
+                    size_t max_slots) {
+    const size_t target = (hashTable.size() + max_slots) * GROWTH_FACTOR;
+
+    if (hashTable.bucket_count() < max_slots) {
+      hashTable.reserve(target);
+    }
+
+    // load_factor() is size() / bucket_count() evaluated in floating point.
+    // Dividing the two size_t values directly would truncate to 0 for every
+    // table that is less than completely full. The bucket count check makes
+    // sure rehash is only requested when it enlarges the bucket array.
+    if (hashTable.load_factor() > AliasModel::loadFactor &&
+        hashTable.bucket_count() < target) {
+      hashTable.rehash(target);
+    }
+  }
+
+  // Warning: do not iterate over these members, as iterating over unordered
+  // containers introduces non-determinism.
+
+  // PopART TensorId -> poprithms TensorId.
+  std::unordered_map<TensorId, PoprithmsTensorId> toTensor_;
+  // poprithms TensorId -> PopART TensorId.
+  std::unordered_map<PoprithmsTensorId, TensorId, PoprithmsTensorIdSteadyHasher>
+      fromTensor_;
+  // PopART OpId -> the poprithms OpIds registered for it.
+  std::unordered_map<OpId, std::vector<PoprithmsOpId>> toOp_;
+  // poprithms OpId -> PopART OpId.
+  std::unordered_map<PoprithmsOpId, OpId, PoprithmsOpIdHasher> fromOp_;
 };
 
 } // namespace popart
diff --git a/willow/src/alias/aliasmodel.cpp b/willow/src/alias/aliasmodel.cpp
@@ -1,8 +1,8 @@
 // Copyright (c) 2021 Graphcore Ltd. All rights reserved.
 #include <algorithm>
-#include <map>
 #include <ostream>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include <poprithms/common/multiout/ioindices.hpp>
@@ -12,6 +12,7 @@
 #include <poprithms/ndarray/shape.hpp>
 #include <poprithms/util/typedinteger.hpp>
 #include <popart/alias/aliasmodel.hpp>
+#include <popart/graph.hpp>
 #include <popart/ir.hpp>
 
 #include "popart/error.hpp"
@@ -22,12 +23,55 @@
 #include "popart/tensordebuginfo.hpp"
 #include "popart/tensorinfo.hpp"
 
+// File-local tuning constants for the AliasModel lookup tables. They live in
+// an unnamed namespace, so they have internal linkage.
+namespace {
+
+// Capacity reserved for the tensor lookup tables when an AliasModel is
+// constructed.
+constexpr int maxInitTensors{1500};
+
+// Capacity reserved for the op lookup tables when an AliasModel is
+// constructed.
+constexpr int maxInitOps{1000};
+
+// Growth factor used when the tensor lookup tables are enlarged.
+constexpr int tGrowthFactor{2};
+
+// Growth factor used when the op lookup tables are enlarged.
+constexpr int opGrowthFactor{2};
+
+} // namespace
+
 namespace popart {
 
 using PoprithmsTensorId = poprithms::memory::inplace::TensorId;
 using PoprithmsOpId     = poprithms::memory::inplace::OpId;
 
+// Reserve initial capacity in every lookup table.
+AliasModel::AliasModel() {
+  // Op <-> poprithms Op tables.
+  toOp_.reserve(maxInitOps);
+  fromOp_.reserve(maxInitOps);
+
+  // Tensor <-> poprithms Tensor tables.
+  toTensor_.reserve(maxInitTensors);
+  fromTensor_.reserve(maxInitTensors);
+}
+
+// Remember the PopART graph so that insertTensor and insertOp can query it.
+void AliasModel::setGraph(const popart::Graph *graph) {
+  // thisGraph is declared as a pointer to non-const Graph, so the const
+  // qualifier has to be cast away before the pointer can be stored.
+  auto *mutableGraph = const_cast<popart::Graph *>(graph);
+  thisGraph          = mutableGraph;
+}
+
 void AliasModel::insertTensor(const PoprithmsTensorId &id, const Tensor &t) {
+  // As long as the required capacity is predicted up front, so that frequent
+  // reallocation is avoided, a hash map is faster than std::map for both
+  // insertion and lookup. This should reduce the compilation time.
+  auto &allTensors = thisGraph->getTensors();
+
+  preAllocAndRehash<tGrowthFactor>(toTensor_, allTensors.n());
+  preAllocAndRehash<tGrowthFactor>(fromTensor_, allTensors.n());
+
   toTensor_[t.id] = id;
   fromTensor_[id] = t.id;
   if (t.hasProducer()) {
@@ -40,6 +84,7 @@ void AliasModel::update(OpId oldId, OpId newId) {
   if (found != toOp_.cend()) {
     auto oldTargets = found->second;
     toOp_.erase(found);
+    // found becomes invalidated immediately
     toOp_[newId] = oldTargets;
     for (auto t : oldTargets) {
       fromOp_[t] = newId;
@@ -56,6 +101,11 @@ void AliasModel::insertOp(PoprithmsOpId poprithmsId, OpId id) {
     }
   }
 
+  auto &allOps = thisGraph->getOps();
+
+  preAllocAndRehash<opGrowthFactor>(fromOp_, allOps.size());
+  preAllocAndRehash<opGrowthFactor>(toOp_, allOps.size());
+
   fromOp_[poprithmsId] = id;
   auto found           = toOp_.find(id);
   if (found == toOp_.cend()) {
diff --git a/willow/src/alias/aliasmodelgrower.cpp b/willow/src/alias/aliasmodelgrower.cpp
@@ -49,6 +49,9 @@ void AliasModelGrower::growFullGraph(const Graph &graph,
       graph.getIr().timePartitionLogger().scopedStopwatch(logging::format(
           "Growing full AliasModel for {}", graph.getGraphString()));
 
+  AliasModel &aliasModel = getAliasModelRef();
+  aliasModel.setGraph(&graph);
+
   // NOTE: This loop does not need a schedule that complies with topocons. It
   // may be possible to make this more efficient by getting an Op-order that is
   // only constrained by data order.
@@ -80,6 +83,9 @@ void AliasModelGrower::growPartialGraph(const Graph &graph,
   auto scopedStopwatch = graph.getIr().timePartitionLogger().scopedStopwatch(
       "Growing partial AliasModel");
 
+  AliasModel &aliasModel = getAliasModelRef();
+  aliasModel.setGraph(&graph);
+
   // When growing a AliasModel, by the nature of the Poprithms' API we
   // must grow ops in schedule order. However, we don't know which tensors
   // that precede `tensorId` in the schedule order may be aliasing `tensorId`