Commit 2886ae2

Implement buffer locking mechanism
The buffer manager introduced two issues (documented in the code) that could cause certain buffer access patterns to become unsafe. This adds a coarse-grained buffer locking mechanism to work around these issues.
1 parent 097c3b6 commit 2886ae2

File tree

8 files changed: +239 additions, −13 deletions

include/buffer_manager.h

Lines changed: 60 additions & 1 deletion

@@ -16,6 +16,7 @@
 #include "ranges.h"
 #include "region_map.h"
 #include "types.h"
+ #include <unordered_set>

 namespace celerity {
 namespace detail {
@@ -62,7 +63,7 @@ namespace detail {
 * Essentially, this means that any requests made to the buffer_manager are assumed to be operations
 * that are currently allowed by the command graph.
 *
- * FIXME: There are two important caveats that we need to deal with:
+ * There are two important caveats that we need to deal with:
 *
 * - Reading from a buffer is no longer a const operation, as the buffer may need to be resized.
 * This means that two tasks that could be considered independent on a TDAG basis actually have an
@@ -75,6 +76,11 @@ namespace detail {
 * buffer first with "discard_write" and followed by a "read" should result in a combined "write" mode.
 * However the effect of the discard_write is recorded immediately, and the buffer_manager will thus
 * wrongly assume that no coherence update for the "read" is required.
+ *
+ * Currently, these issues are handled through the buffer locking mechanism.
+ * See buffer_manager::try_lock, buffer_manager::unlock and buffer_manager::is_locked.
+ *
+ * FIXME: The current buffer locking mechanism limits task parallelism. Come up with a better solution.
 */
 class buffer_manager {
 public:
@@ -104,6 +110,8 @@ namespace detail {
 cl::sycl::id<Dims> offset;
 };

+ using buffer_lock_id = size_t;
+
 public:
 buffer_manager(device_queue& queue, buffer_lifecycle_callback lifecycle_cb);

@@ -212,6 +220,8 @@ namespace detail {
 }
 }

+ audit_buffer_access(bid, new_buffer.is_allocated(), mode);
+
 backing_buffer& target_buffer = new_buffer.is_allocated() ? new_buffer : old_buffer;
 const backing_buffer empty{};
 const backing_buffer& previous_buffer = new_buffer.is_allocated() ? old_buffer : empty;
@@ -242,6 +252,8 @@ namespace detail {
 }
 }

+ audit_buffer_access(bid, new_buffer.is_allocated(), mode);
+
 backing_buffer& target_buffer = new_buffer.is_allocated() ? new_buffer : old_buffer;
 const backing_buffer empty{};
 const backing_buffer& previous_buffer = new_buffer.is_allocated() ? old_buffer : empty;
@@ -253,6 +265,32 @@ namespace detail {
 id_cast<Dims>(buffers[bid].host_buf.offset)};
 }

+ /**
+  * @brief Tries to lock the given list of @p buffers using the given lock @p id.
+  *
+  * If any of the buffers is currently locked, the locking attempt fails.
+  *
+  * Locking is currently an optional (opt-in) mechanism, i.e., buffers can also be
+  * accessed without being locked. This is because locking is a bit of a band-aid fix
+  * that doesn't properly cover all use-cases (for example, host-pointer initialized buffers).
+  *
+  * However, when accessing a locked buffer, the buffer_manager enforces additional
+  * rules to ensure they are used in a safe manner for the duration of the lock:
+  * - A locked buffer may only be resized at most once, and only for the first access.
+  * - A locked buffer may not be accessed using consumer access modes, if it was previously
+  *   accessed using a pure producer mode.
+  *
+  * @returns Returns true if the list of buffers was successfully locked.
+  */
+ bool try_lock(buffer_lock_id, const std::unordered_set<buffer_id>& buffers);
+
+ /**
+  * Unlocks all buffers that were previously locked with a call to try_lock with the given @p id.
+  */
+ void unlock(buffer_lock_id id);
+
+ bool is_locked(buffer_id bid) const;
+
 private:
 struct backing_buffer {
 std::unique_ptr<buffer_storage> storage = nullptr;
@@ -302,6 +340,15 @@ namespace detail {
 struct buffer_type_guard : buffer_type_guard_base {};
 #endif

+ struct buffer_lock_info {
+ bool is_locked = false;
+
+ // For lack of a better name, this stores *an* access mode that has already been used during this lock.
+ // While it initially stores whatever is first used to access the buffer, it will always be overwritten
+ // by subsequent pure producer accesses, as those are the only ones we really care about.
+ std::optional<cl::sycl::access::mode> earlier_access_mode = std::nullopt;
+ };
+
 private:
 device_queue& queue;
 buffer_lifecycle_callback lifecycle_cb;
@@ -312,6 +359,9 @@ namespace detail {
 std::unordered_map<buffer_id, std::vector<transfer>> scheduled_transfers;
 std::unordered_map<buffer_id, region_map<data_location>> newest_data_location;

+ std::unordered_map<buffer_id, buffer_lock_info> buffer_lock_infos;
+ std::unordered_map<buffer_lock_id, std::vector<buffer_id>> buffer_locks_by_id;
+
 #if !defined(NDEBUG)
 // Since we store buffers without type information (i.e., its data type and dimensionality),
 // it is the user's responsibility to only request access to a buffer using the correct type.
@@ -356,6 +406,15 @@ namespace detail {
 */
 void make_buffer_subrange_coherent(buffer_id bid, cl::sycl::access::mode mode, backing_buffer& target_buffer, const subrange<3>& coherent_sr,
 const backing_buffer& previous_buffer = backing_buffer{});
+
+ /**
+  * Checks whether access to a currently locked buffer is safe.
+  *
+  * There's two distinct issues that can cause an access to be unsafe:
+  * - If a buffer that has been accessed earlier needs to be resized (reallocated) now
+  * - If a buffer was previously accessed using a discard_* mode and is now accessed using a consumer mode
+  */
+ void audit_buffer_access(buffer_id bid, bool requires_allocation, cl::sycl::access::mode mode);
 };

 } // namespace detail
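
The semantics documented in the header above can be summarized in a short, test-style sketch against this interface. This is a hypothetical illustration, not part of the commit: the lock ids and buffer ids are made up, and a buffer_manager instance is assumed to exist already.

    #include <cassert>
    #include <unordered_set>
    #include "buffer_manager.h" // the header shown above

    void lock_semantics_sketch(celerity::detail::buffer_manager& bm) {
        using namespace celerity::detail;
        const buffer_manager::buffer_lock_id lock_a = 1, lock_b = 2;

        bool ok = bm.try_lock(lock_a, {0, 1}); // locks buffers 0 and 1
        assert(ok && bm.is_locked(0) && bm.is_locked(1));

        ok = bm.try_lock(lock_b, {1});         // fails: buffer 1 is already locked
        assert(!ok);

        bm.unlock(lock_a);                     // releases all buffers locked under lock_a
        assert(!bm.is_locked(0) && !bm.is_locked(1));

        ok = bm.try_lock(lock_b, {1});         // now succeeds
        assert(ok);
        bm.unlock(lock_b);
    }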

include/executor.h

Lines changed: 4 additions & 1 deletion

@@ -3,6 +3,7 @@
 #include <chrono>
 #include <thread>

+ #include "buffer_manager.h"
 #include "buffer_transfer_manager.h"
 #include "logger.h"
 #include "worker_job.h"
@@ -41,7 +42,7 @@ namespace detail {
 class executor {
 public:
 // TODO: Try to decouple this more.
- executor(host_queue& h_queue, device_queue& d_queue, task_manager& tm, std::shared_ptr<logger> execution_logger);
+ executor(host_queue& h_queue, device_queue& d_queue, task_manager& tm, buffer_manager& buffer_mngr, std::shared_ptr<logger> execution_logger);

 void startup();

@@ -59,6 +60,8 @@ namespace detail {
 host_queue& h_queue;
 device_queue& d_queue;
 task_manager& task_mngr;
+ // FIXME: We currently need this for buffer locking in some jobs, which is a bit of a band-aid fix. Get rid of this at some point.
+ buffer_manager& buffer_mngr;
 std::unique_ptr<buffer_transfer_manager> btm;
 std::shared_ptr<logger> execution_logger;
 std::thread exec_thrd;

include/worker_job.h

Lines changed: 10 additions & 5 deletions

@@ -6,6 +6,7 @@
 #include <limits>
 #include <utility>

+ #include "buffer_manager.h"
 #include "buffer_transfer_manager.h"
 #include "command.h"
 #include "host_queue.h"
@@ -84,12 +85,14 @@ namespace detail {

 class push_job : public worker_job {
 public:
- push_job(command_pkg pkg, std::shared_ptr<logger> job_logger, buffer_transfer_manager& btm) : worker_job(pkg, job_logger), btm(btm) {
+ push_job(command_pkg pkg, std::shared_ptr<logger> job_logger, buffer_transfer_manager& btm, buffer_manager& bm)
+     : worker_job(pkg, job_logger), btm(btm), buffer_mngr(bm) {
 assert(pkg.cmd == command_type::PUSH);
 }

 private:
 buffer_transfer_manager& btm;
+ buffer_manager& buffer_mngr;
 std::shared_ptr<const buffer_transfer_manager::transfer_handle> data_handle = nullptr;

 bool execute(const command_pkg& pkg, std::shared_ptr<logger> logger) override;
@@ -99,14 +102,15 @@ namespace detail {
 // host-compute jobs, master-node tasks and collective host tasks
 class host_execute_job : public worker_job {
 public:
- host_execute_job(command_pkg pkg, std::shared_ptr<logger> job_logger, detail::host_queue& queue, detail::task_manager& tm)
-     : worker_job(pkg, job_logger), queue(queue), task_mngr(tm) {
+ host_execute_job(command_pkg pkg, std::shared_ptr<logger> job_logger, detail::host_queue& queue, detail::task_manager& tm, buffer_manager& bm)
+     : worker_job(pkg, job_logger), queue(queue), task_mngr(tm), buffer_mngr(bm) {
 assert(pkg.cmd == command_type::TASK);
 }

 private:
 detail::host_queue& queue;
 detail::task_manager& task_mngr;
+ detail::buffer_manager& buffer_mngr;
 std::future<detail::host_queue::execution_info> future;
 bool submitted = false;

@@ -120,14 +124,15 @@ namespace detail {
 */
 class device_execute_job : public worker_job {
 public:
- device_execute_job(command_pkg pkg, std::shared_ptr<logger> job_logger, detail::device_queue& queue, detail::task_manager& tm)
-     : worker_job(pkg, job_logger), queue(queue), task_mngr(tm) {
+ device_execute_job(command_pkg pkg, std::shared_ptr<logger> job_logger, detail::device_queue& queue, detail::task_manager& tm, buffer_manager& bm)
+     : worker_job(pkg, job_logger), queue(queue), task_mngr(tm), buffer_mngr(bm) {
 assert(pkg.cmd == command_type::TASK);
 }

 private:
 detail::device_queue& queue;
 detail::task_manager& task_mngr;
+ detail::buffer_manager& buffer_mngr;
 cl::sycl::event event;
 bool submitted = false;

src/buffer_manager.cc

Lines changed: 55 additions & 0 deletions

@@ -73,6 +73,32 @@ namespace detail {
 scheduled_transfers[bid].push_back({std::move(data), offset});
 }

+ bool buffer_manager::try_lock(buffer_lock_id id, const std::unordered_set<buffer_id>& buffers) {
+ assert(buffer_locks_by_id.count(id) == 0);
+ for(auto bid : buffers) {
+ if(buffer_lock_infos[bid].is_locked) return false;
+ }
+ buffer_locks_by_id[id].reserve(buffers.size());
+ for(auto bid : buffers) {
+ buffer_lock_infos[bid] = {true, std::nullopt};
+ buffer_locks_by_id[id].push_back(bid);
+ }
+ return true;
+ }
+
+ void buffer_manager::unlock(buffer_lock_id id) {
+ assert(buffer_locks_by_id.count(id) != 0);
+ for(auto bid : buffer_locks_by_id[id]) {
+ buffer_lock_infos[bid] = {};
+ }
+ buffer_locks_by_id.erase(id);
+ }
+
+ bool buffer_manager::is_locked(buffer_id bid) const {
+ if(buffer_lock_infos.count(bid) == 0) return false;
+ return buffer_lock_infos.at(bid).is_locked;
+ }
+
 // TODO: Something we could look into is to dispatch all memory copies concurrently and wait for them in the end.
 void buffer_manager::make_buffer_subrange_coherent(
 buffer_id bid, cl::sycl::access::mode mode, backing_buffer& target_buffer, const subrange<3>& coherent_sr, const backing_buffer& previous_buffer) {
@@ -218,5 +244,34 @@ namespace detail {
 if(detail::access::mode_traits::is_producer(mode)) { newest_data_location.at(bid).update_region(coherent_box, target_buffer_location); }
 }

+ void buffer_manager::audit_buffer_access(buffer_id bid, bool requires_allocation, cl::sycl::access::mode mode) {
+ auto& lock_info = buffer_lock_infos[bid];
+
+ // Buffer locking is currently opt-in, so if this buffer isn't locked, we won't check anything else.
+ if(!lock_info.is_locked) return;
+
+ if(lock_info.earlier_access_mode == std::nullopt) {
+ // First access, all good.
+ lock_info.earlier_access_mode = mode;
+ return;
+ }
+
+ if(requires_allocation) {
+ // Re-allocation of a buffer that is currently being accessed never works.
+ throw std::runtime_error("You are requesting multiple accessors for the same buffer, with later ones requiring a larger part of the buffer, "
+     "causing a backing buffer reallocation. "
+     "This is currently unsupported. Try changing the order of your calls to buffer::get_access.");
+ }
+
+ if(!access::mode_traits::is_consumer(*lock_info.earlier_access_mode) && access::mode_traits::is_consumer(mode)) {
+ // Accessing a buffer using a pure producer mode followed by a consumer mode breaks our coherence bookkeeping.
+ throw std::runtime_error("You are requesting multiple accessors for the same buffer, using a discarding access mode first, followed by a "
+     "non-discarding mode. This is currently unsupported. Try changing the order of your calls to buffer::get_access.");
+ }
+
+ // We only need to remember pure producer accesses.
+ if(!access::mode_traits::is_consumer(mode)) { lock_info.earlier_access_mode = mode; }
+ }
+
 } // namespace detail
 } // namespace celerity
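
For context, the pattern that audit_buffer_access rejects corresponds roughly to user code like the sketch below: a pure producer accessor followed by a consumer accessor for the same buffer within one task. This is illustrative only; the range-mapper and kernel syntax follow the Celerity API of roughly this period and may differ in other versions, and reading otherwise uninitialized data is deliberate to keep the example minimal.

    #include <celerity.h>

    int main() {
        celerity::distr_queue queue;
        celerity::buffer<float, 1> buf(cl::sycl::range<1>(1024));

        queue.submit([=](celerity::handler& cgh) {
            // Pure producer access first ...
            auto w = buf.get_access<cl::sycl::access::mode::discard_write>(cgh, celerity::access::one_to_one<1>());
            // ... then a consumer access to the same buffer within the same task.
            // When the live pass requests the backing buffers for this (locked) task,
            // audit_buffer_access throws the "discarding access mode first" error above
            // and suggests swapping the order of the two get_access calls.
            auto r = buf.get_access<cl::sycl::access::mode::read>(cgh, celerity::access::one_to_one<1>());
            cgh.parallel_for<class rejected_pattern>(cl::sycl::range<1>(1024), [=](cl::sycl::item<1> itm) { w[itm] = r[itm] + 1.f; });
        });
        return 0;
    }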

src/executor.cc

Lines changed: 5 additions & 5 deletions

@@ -22,8 +22,8 @@ namespace detail {
 running = false;
 }

- executor::executor(host_queue& h_queue, device_queue& d_queue, task_manager& tm, std::shared_ptr<logger> execution_logger)
-     : h_queue(h_queue), d_queue(d_queue), task_mngr(tm), execution_logger(execution_logger) {
+ executor::executor(host_queue& h_queue, device_queue& d_queue, task_manager& tm, buffer_manager& buffer_mngr, std::shared_ptr<logger> execution_logger)
+     : h_queue(h_queue), d_queue(d_queue), task_mngr(tm), buffer_mngr(buffer_mngr), execution_logger(execution_logger) {
 btm = std::make_unique<buffer_transfer_manager>(execution_logger);
 metrics.initial_idle.resume();
 }
@@ -155,7 +155,7 @@ namespace detail {
 bool executor::handle_command(const command_pkg& pkg, const std::vector<command_id>& dependencies) {
 switch(pkg.cmd) {
 case command_type::HORIZON: create_job<horizon_job>(pkg, dependencies); break;
- case command_type::PUSH: create_job<push_job>(pkg, dependencies, *btm); break;
+ case command_type::PUSH: create_job<push_job>(pkg, dependencies, *btm, buffer_mngr); break;
 case command_type::AWAIT_PUSH: create_job<await_push_job>(pkg, dependencies, *btm); break;
 case command_type::TASK: {
 const auto& data = std::get<task_data>(pkg.data);
@@ -165,9 +165,9 @@ namespace detail {

 auto tsk = task_mngr.get_task(data.tid);
 if(tsk->get_execution_target() == execution_target::HOST) {
- create_job<host_execute_job>(pkg, dependencies, h_queue, task_mngr);
+ create_job<host_execute_job>(pkg, dependencies, h_queue, task_mngr, buffer_mngr);
 } else {
- create_job<device_execute_job>(pkg, dependencies, d_queue, task_mngr);
+ create_job<device_execute_job>(pkg, dependencies, d_queue, task_mngr, buffer_mngr);
 }
 break;
 }

src/runtime.cc

Lines changed: 1 addition & 1 deletion

@@ -109,7 +109,7 @@ namespace detail {
 }
 });
 task_mngr = std::make_unique<task_manager>(num_nodes, h_queue.get(), is_master);
- exec = std::make_unique<executor>(*h_queue, *d_queue, *task_mngr, default_logger);
+ exec = std::make_unique<executor>(*h_queue, *d_queue, *task_mngr, *buffer_mngr, default_logger);
 if(is_master) {
 cdag = std::make_unique<command_graph>();
 ggen = std::make_shared<graph_generator>(num_nodes, *task_mngr, *cdag);

src/worker_job.cc

Lines changed: 18 additions & 0 deletions

@@ -81,10 +81,18 @@ namespace detail {

 bool push_job::execute(const command_pkg& pkg, std::shared_ptr<logger> logger) {
 if(data_handle == nullptr) {
+ const auto data = std::get<push_data>(pkg.data);
+ // Getting buffer data from the buffer manager may incur a host-side buffer reallocation.
+ // If any other tasks are currently using this buffer for reading, we run into problems.
+ // To avoid this, we use a very crude buffer locking mechanism for now.
+ // FIXME: Get rid of this, replace with finer grained approach.
+ if(buffer_mngr.is_locked(data.bid)) { return false; }
+
 logger->trace(logger_map({{"event", "Submit buffer to BTM"}}));
 data_handle = btm.push(pkg);
 logger->trace(logger_map({{"event", "Buffer submitted to BTM"}}));
 }
+
 return data_handle->complete;
 }

@@ -102,6 +110,9 @@ namespace detail {
 if(!submitted) {
 auto tsk = task_mngr.get_task(data.tid);
 assert(tsk->get_execution_target() == execution_target::HOST);
+
+ if(!buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; }
+
 logger->trace(logger_map({{"event", "Execute live-pass, scheduling host task in thread pool"}}));

 // Note that for host tasks, there is no indirection through a queue->submit step like there is for SYCL tasks. The CGF is executed directly,
@@ -118,6 +129,8 @@ namespace detail {

 assert(future.valid());
 if(future.wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
+ buffer_mngr.unlock(pkg.cid);
+
 auto info = future.get();
 logger->trace(logger_map({{"event", fmt::format("Delta time submit -> start: {}us",
     std::chrono::duration_cast<std::chrono::microseconds>(info.start_time - info.submit_time).count())}}));
@@ -153,6 +166,9 @@ namespace detail {
 if(!submitted) {
 auto tsk = task_mngr.get_task(data.tid);
 assert(tsk->get_execution_target() == execution_target::DEVICE);
+
+ if(!buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; }
+
 logger->trace(logger_map({{"event", "Execute live-pass, submit kernel to SYCL"}}));

 event = queue.submit([tsk, sr = data.sr](cl::sycl::handler& handler, size_t forced_work_group_size) {
@@ -167,6 +183,8 @@ namespace detail {

 const auto status = event.get_info<cl::sycl::info::event::command_execution_status>();
 if(status == cl::sycl::info::event_command_status::complete) {
+ buffer_mngr.unlock(pkg.cid);
+
 #if !WORKAROUND(HIPSYCL, 0)
 if(queue.is_profiling_enabled()) {
 const auto queued = get_profiling_info(event.get(), CL_PROFILING_COMMAND_QUEUED);
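
Taken together, the job-side changes above form a simple acquire/defer/release protocol: a task job only starts once it can lock all buffers it accesses (using the command id pkg.cid as the lock id), and a push of a locked buffer is deferred until the lock is released. Below is a condensed sketch of that protocol, not the actual worker_job code; the executor's polling loop, logging, and command handling are omitted.

    #include <unordered_set>
    #include "buffer_manager.h"

    // Simplified sketch of the protocol followed by host_execute_job / device_execute_job.
    // execute() is polled repeatedly by the executor; returning false means "not done yet, poll again".
    bool execute_step(celerity::detail::buffer_manager& bm, size_t lock_id,
        const std::unordered_set<celerity::detail::buffer_id>& accessed_buffers, bool& submitted, bool finished) {
        if(!submitted) {
            // Defer the whole task while any of its buffers is still locked by another job.
            if(!bm.try_lock(lock_id, accessed_buffers)) return false;
            // ... submit the kernel / host task here ...
            submitted = true;
        }
        if(!finished) return false; // still running, keep polling
        bm.unlock(lock_id);         // done: release the buffers so deferred jobs (e.g. pushes) can proceed
        return true;
    }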
