The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [0.6.0] - 2024-08-12

This release includes changes that may require adjustments when upgrading:

- A single Celerity process can now manage multiple devices.
  This means that on a cluster with 4 GPUs per node, only a single MPI rank needs to be spawned per node.
  The previous behavior of having a separate process per device is still supported but discouraged, as it incurs additional overhead.
- It is no longer possible to assign a device to a Celerity process using the `CELERITY_DEVICES` environment variable.
  Please use vendor-specific mechanisms (such as `CUDA_VISIBLE_DEVICES`) for limiting the set of visible devices instead.
- We recommend performing a clean build when updating Celerity so that updated submodule dependencies are properly propagated.

We recommend using the following SYCL versions with this release:

- DPC++: 89327e0a or newer
- AdaptiveCpp (formerly hipSYCL): v24.06
- SimSYCL: master

See our [platform support guide](docs/platform-support.md) for a complete list of all officially supported configurations.

### Added

- Add support for SimSYCL as a SYCL implementation (#238)
- Extend compiler support to GCC (optionally with sanitizers) and C++20 code bases (#238)
- `celerity::hints::oversubscribe` can be passed to a command group to increase split granularity and improve computation-communication overlap (#249); see the first sketch below
- Reductions are now unconditionally supported on all SYCL implementations (#265)
- Add support for profiling with [Tracy](https://github.com/wolfpld/tracy), via the `CELERITY_TRACY_SUPPORT` build option and the `CELERITY_TRACY` environment variable (#267)
- The active SYCL implementation can now be queried via `CELERITY_SYCL_IS_*` macros (#277); see the second sketch below
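
Below is a minimal sketch of how the oversubscription hint from #249 might be attached to a command group. The hint type name follows the entry above; the `celerity::experimental::hint` attachment call, the oversubscription factor of 4, and the kernel name are assumptions for illustration, so consult the Celerity documentation for the exact API.

```cpp
#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<float, 1> buf{celerity::range<1>{1024}};

	q.submit([&](celerity::handler& cgh) {
		// Ask for a finer split of this command group so communication can overlap
		// with computation. The attachment call and the factor are illustrative
		// assumptions; see the Celerity docs for the exact signature.
		celerity::experimental::hint(cgh, celerity::hints::oversubscribe{4});

		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class fill_values>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = 1.0f; });
	});
}
```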
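
The `CELERITY_SYCL_IS_*` macros from #277 allow implementation-specific code paths at compile time. A short sketch follows; the exact macro suffixes (`DPCPP`, `ACPP`, `SIMSYCL`) are assumptions derived from the SYCL implementations listed above, so check the generated `version.h` for the authoritative names.

```cpp
#include <celerity.h>
#include <cstdio>

// Report which SYCL implementation Celerity was built against. The macro
// suffixes used here are assumptions; the generated version.h lists the
// actual names.
void report_sycl_implementation() {
#if CELERITY_SYCL_IS_DPCPP
	std::puts("Celerity was built against DPC++");
#elif CELERITY_SYCL_IS_ACPP
	std::puts("Celerity was built against AdaptiveCpp");
#elif CELERITY_SYCL_IS_SIMSYCL
	std::puts("Celerity was built against SimSYCL");
#else
	std::puts("Unknown SYCL implementation");
#endif
}
```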

### Changed

- All low-level host / device operations such as memory allocations, copies, and kernel launches are now represented in the single Instruction Graph for improved asynchronicity (#249)
- Celerity can now maintain multiple disjoint backing allocations per buffer, so disjoint accesses to the same buffer do not trigger bounding-box allocations (#249)
- The previous implicit size limit of 128 GiB on buffer transfers is lifted (#249, #252)
- Celerity now manages multiple devices per node / MPI rank, which significantly reduces overhead in multi-GPU setups (#265)
- Runtime lifetime is extended until destruction of the last queue, buffer, or host object (#265); see the sketch after this list
- Host object instances are now destroyed from a runtime background thread instead of the application thread (#265)
- Collective host tasks in the same collective group continue to execute on the same communicator, but no longer necessarily on the same background thread (#265)
- Updated the internal [libenvpp](https://github.com/ph3at/libenvpp) dependency to 1.4.1 and use its new features (#271)
- Celerity's compile-time feature flags and options are now written to `version.h` instead of being passed on the command line (#277)
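
To illustrate the lifetime change from #265, here is a minimal sketch assuming the `celerity::distr_queue`-based API of this release; the function and kernel names are made up. The runtime is now kept alive until the last of `q` and `buf` is destroyed.

```cpp
#include <celerity.h>

void submit_work() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf{celerity::range<1>{256}};

	q.submit([&](celerity::handler& cgh) {
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class init_values>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = 0; });
	});

	// As of 0.6.0 the runtime is torn down only after the last queue, buffer, or
	// host object is destroyed, i.e. once both `q` and `buf` go out of scope here.
	// Host object instances are destroyed from a runtime background thread.
}
```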

### Fixed

- Scheduler tracking structures are now garbage-collected after buffers and host objects go out of scope (#246)
- The previous requirement to order accessors by access mode is lifted (#265)
- SYCL reductions to which only some Celerity nodes contribute partial results would read uninitialized data (#265); see the sketch after this list
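
Since both the Added and Fixed sections above touch on reductions (#265), here is a rough sketch of a Celerity reduction. It is modeled on SYCL 2020 reductions; the exact `celerity::reduction` factory signature and the `initialize_to_identity` property shown are assumptions, so refer to the Celerity reduction documentation for the precise API.

```cpp
#include <celerity.h>
#include <sycl/sycl.hpp>

void sum_example() {
	const size_t n = 4096;
	celerity::distr_queue q;
	celerity::buffer<float, 1> data{celerity::range<1>{n}};
	celerity::buffer<float, 1> sum{celerity::range<1>{1}};

	q.submit([&](celerity::handler& cgh) {
		celerity::accessor in{data, cgh, celerity::access::one_to_one{}, celerity::read_only};
		// The factory call below follows the SYCL 2020 reduction interface and is
		// an assumed signature, not a verbatim copy of the Celerity API.
		auto rd = celerity::reduction(sum, cgh, sycl::plus<float>{},
		                              celerity::property::reduction::initialize_to_identity{});
		cgh.parallel_for<class reduce_sum>(celerity::range<1>{n}, rd,
		                                   [=](celerity::item<1> it, auto& acc_sum) { acc_sum += in[it]; });
	});
}
```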

### Removed

- Celerity no longer attempts to spill device allocations to the host if resizing a buffer fails due to an out-of-memory condition (#265)
- The `CELERITY_DEVICES` environment variable is removed in favor of platform-specific visibility specifiers such as `CUDA_VISIBLE_DEVICES` (#265)
- The obsolete `experimental::user_benchmarker` infrastructure has been removed (#268)