Skip to content

Commit e444051

Browse files
committed
Do a small beauty pass on examples
Replace `cl::sycl` with `sycl`, fix minor issues, improve naming consistency, reduce noise and in particular get rid of explicit MPI interactions.
1 parent e231e58 commit e444051

File tree

6 files changed

+122
-152
lines changed

6 files changed

+122
-152
lines changed

examples/convolution/convolution.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) {
1919
return EXIT_FAILURE;
2020
}
2121

22-
std::vector<cl::sycl::float3> image_input;
22+
std::vector<sycl::float3> image_input;
2323
int image_width = 0, image_height = 0, image_channels = 0;
2424
{
2525
uint8_t* image_data = stbi_load(argv[1], &image_width, &image_height, &image_channels, 3);
@@ -50,8 +50,8 @@ int main(int argc, char* argv[]) {
5050

5151
celerity::distr_queue queue;
5252

53-
celerity::buffer<cl::sycl::float3, 2> image_input_buf(image_input.data(), celerity::range<2>(image_height, image_width));
54-
celerity::buffer<cl::sycl::float3, 2> image_tmp_buf(celerity::range<2>(image_height, image_width));
53+
celerity::buffer<sycl::float3, 2> image_input_buf(image_input.data(), celerity::range<2>(image_height, image_width));
54+
celerity::buffer<sycl::float3, 2> image_tmp_buf(celerity::range<2>(image_height, image_width));
5555

5656
celerity::buffer<float, 2> gaussian_mat_buf(gaussian_matrix.data(), celerity::range<2>(FILTER_SIZE, FILTER_SIZE));
5757

@@ -62,7 +62,7 @@ int main(int argc, char* argv[]) {
6262
celerity::accessor out{image_tmp_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
6363

6464
cgh.parallel_for<class gaussian_blur>(celerity::range<2>(image_height, image_width), [=, fs = FILTER_SIZE](celerity::item<2> item) {
65-
using cl::sycl::float3;
65+
using sycl::float3;
6666
if(is_on_boundary(celerity::range<2>(image_height, image_width), fs, item)) {
6767
out[item] = float3(0.f, 0.f, 0.f);
6868
return;
@@ -78,15 +78,15 @@ int main(int argc, char* argv[]) {
7878
});
7979
});
8080

81-
celerity::buffer<cl::sycl::float3, 2> image_output_buf(celerity::range<2>(image_height, image_width));
81+
celerity::buffer<sycl::float3, 2> image_output_buf(celerity::range<2>(image_height, image_width));
8282

8383
// Now apply a sharpening kernel
8484
queue.submit([=](celerity::handler& cgh) {
8585
celerity::accessor in{image_tmp_buf, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only};
8686
celerity::accessor out{image_output_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
8787

8888
cgh.parallel_for<class sharpen>(celerity::range<2>(image_height, image_width), [=, fs = FILTER_SIZE](celerity::item<2> item) {
89-
using cl::sycl::float3;
89+
using sycl::float3;
9090
if(is_on_boundary(celerity::range<2>(image_height, image_width), fs, item)) {
9191
out[item] = float3(0.f, 0.f, 0.f);
9292
return;

examples/distr_io/distr_io.cc

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,6 @@ celerity::subrange<2> transposed(celerity::chunk<2> chnk) {
9494
int main(int argc, char* argv[]) {
9595
const size_t N = 1000;
9696

97-
celerity::detail::runtime::init(&argc, &argv);
98-
99-
int rank = 1;
100-
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
101-
10297
if((argc == 3 || argc == 4) && strcmp(argv[1], "--generate") == 0) {
10398
std::vector<float> initial(N * N);
10499
unsigned long seed = 1234567890;
@@ -110,7 +105,10 @@ int main(int argc, char* argv[]) {
110105

111106
celerity::distr_queue q;
112107
write_hdf5_file(q, out, argv[2]);
113-
} else if(argc == 4 && strcmp(argv[1], "--transpose") == 0) {
108+
return EXIT_SUCCESS;
109+
}
110+
111+
if(argc == 4 && strcmp(argv[1], "--transpose") == 0) {
114112
celerity::buffer<float, 2> in(celerity::range<2>{N, N});
115113
celerity::buffer<float, 2> out(celerity::range<2>{N, N});
116114

@@ -128,7 +126,10 @@ int main(int argc, char* argv[]) {
128126
});
129127

130128
write_hdf5_file(q, out, argv[3]);
131-
} else if(argc == 4 && strcmp(argv[1], "--compare") == 0) {
129+
return EXIT_SUCCESS;
130+
}
131+
132+
if(argc == 4 && strcmp(argv[1], "--compare") == 0) {
132133
bool equal = true;
133134
{
134135
celerity::distr_queue q;
@@ -148,17 +149,17 @@ int main(int argc, char* argv[]) {
148149
equal &= a[{i, j}] == b[{i, j}];
149150
}
150151
}
152+
fprintf(stderr, "=> Files are %sequal\n", equal ? "" : "NOT ");
151153
});
152154
});
153155
}
154-
155-
if(rank == 0) { fprintf(stderr, "=> Files are %sequal\n", equal ? "" : "NOT "); }
156-
} else {
157-
fprintf(stderr,
158-
"Usage: %s --generate <out-file> to generate random data\n"
159-
" %s --transpose <in-file> <out-file> to transpose\n"
160-
" %s --compare <in-file> <out-file> to compare for equality\n",
161-
argv[0], argv[0], argv[0]);
162-
return EXIT_FAILURE;
156+
return equal ? EXIT_SUCCESS : EXIT_FAILURE;
163157
}
158+
159+
fprintf(stderr,
160+
"Usage: %s --generate <out-file> to generate random data\n"
161+
" %s --transpose <in-file> <out-file> to transpose\n"
162+
" %s --compare <in-file> <out-file> to compare for equality\n",
163+
argv[0], argv[0], argv[0]);
164+
return EXIT_FAILURE;
164165
}

examples/matmul/matmul.cc

Lines changed: 30 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -63,53 +63,39 @@ void multiply(celerity::distr_queue queue, celerity::buffer<T, 2> mat_a, celerit
6363
int main(int argc, char* argv[]) {
6464
bool verification_passed = true;
6565

66-
celerity::runtime::init(&argc, &argv);
67-
68-
int rank;
69-
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
70-
71-
celerity::experimental::bench::log_user_config({{"matSize", std::to_string(MAT_SIZE)}});
72-
73-
{
74-
celerity::distr_queue queue;
75-
76-
auto range = celerity::range<2>(MAT_SIZE, MAT_SIZE);
77-
celerity::buffer<float, 2> mat_a_buf(range);
78-
celerity::buffer<float, 2> mat_b_buf(range);
79-
celerity::buffer<float, 2> mat_c_buf(range);
80-
81-
set_identity(queue, mat_a_buf);
82-
set_identity(queue, mat_b_buf);
83-
84-
celerity::experimental::bench::begin("main program");
85-
86-
multiply(queue, mat_a_buf, mat_b_buf, mat_c_buf);
87-
multiply(queue, mat_b_buf, mat_c_buf, mat_a_buf);
88-
89-
queue.submit(celerity::allow_by_ref, [&](celerity::handler& cgh) {
90-
celerity::accessor result{mat_a_buf, cgh, celerity::access::one_to_one{}, celerity::read_only_host_task};
91-
92-
cgh.host_task(range, [=, &verification_passed](celerity::partition<2> part) {
93-
celerity::experimental::bench::end("main program");
94-
95-
auto sr = part.get_subrange();
96-
for(size_t i = sr.offset[0]; i < sr.offset[0] + sr.range[0]; ++i) {
97-
for(size_t j = sr.offset[0]; j < sr.offset[0] + sr.range[0]; ++j) {
98-
const float kernel_value = result[{i, j}];
99-
const float host_value = i == j;
100-
if(kernel_value != host_value) {
101-
fprintf(stderr, "rank %d: VERIFICATION FAILED for element %zu,%zu: %f (received) != %f (expected)\n", rank, i, j, kernel_value,
102-
host_value);
103-
verification_passed = false;
104-
break;
105-
}
66+
celerity::distr_queue queue;
67+
68+
auto range = celerity::range<2>(MAT_SIZE, MAT_SIZE);
69+
celerity::buffer<float, 2> mat_a_buf(range);
70+
celerity::buffer<float, 2> mat_b_buf(range);
71+
celerity::buffer<float, 2> mat_c_buf(range);
72+
73+
set_identity(queue, mat_a_buf);
74+
set_identity(queue, mat_b_buf);
75+
76+
multiply(queue, mat_a_buf, mat_b_buf, mat_c_buf);
77+
multiply(queue, mat_b_buf, mat_c_buf, mat_a_buf);
78+
79+
queue.submit(celerity::allow_by_ref, [&](celerity::handler& cgh) {
80+
celerity::accessor result{mat_a_buf, cgh, celerity::access::one_to_one{}, celerity::read_only_host_task};
81+
82+
cgh.host_task(range, [=, &verification_passed](celerity::partition<2> part) {
83+
auto sr = part.get_subrange();
84+
for(size_t i = sr.offset[0]; i < sr.offset[0] + sr.range[0]; ++i) {
85+
for(size_t j = sr.offset[0]; j < sr.offset[0] + sr.range[0]; ++j) {
86+
const float received = result[{i, j}];
87+
const float expected = float(i == j);
88+
if(expected != received) {
89+
fprintf(stderr, "VERIFICATION FAILED for element %zu,%zu: %f (received) != %f (expected)\n", i, j, received, expected);
90+
verification_passed = false;
91+
break;
10692
}
107-
if(!verification_passed) { break; }
10893
}
109-
if(verification_passed) { printf("rank %d: VERIFICATION PASSED!\n", rank); }
110-
});
94+
if(!verification_passed) { break; }
95+
}
96+
if(verification_passed) { printf("VERIFICATION PASSED!\n"); }
11197
});
112-
}
98+
});
11399

114100
return verification_passed ? EXIT_SUCCESS : EXIT_FAILURE;
115101
}

examples/reduction/reduction.cc

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,31 @@
99
#include <stb/stb_image_write.h>
1010

1111

12-
cl::sycl::float4 srgb_to_rgb(cl::sycl::float4 srgb) {
12+
sycl::float4 srgb_to_rgb(sycl::float4 srgb) {
1313
const auto linearize = [](float u) {
1414
if(u <= 0.04045f) {
1515
return u / 12.92f;
1616
} else {
17-
return cl::sycl::pow((u + 0.055f) / 1.055f, 2.4f);
17+
return sycl::pow((u + 0.055f) / 1.055f, 2.4f);
1818
}
1919
};
20-
return cl::sycl::float4{
20+
return sycl::float4{
2121
linearize(srgb.r()),
2222
linearize(srgb.g()),
2323
linearize(srgb.b()),
2424
0,
2525
};
2626
}
2727

28-
cl::sycl::float4 rgb_to_srgb(cl::sycl::float4 linear) {
28+
sycl::float4 rgb_to_srgb(sycl::float4 linear) {
2929
const auto compress = [](float u) {
3030
if(u <= 0.0031308f) {
3131
return 12.92f * u;
3232
} else {
33-
return 1.055f * cl::sycl::pow(u, 1.f / 2.4f) - 0.055f;
33+
return 1.055f * sycl::pow(u, 1.f / 2.4f) - 0.055f;
3434
}
3535
};
36-
return cl::sycl::float4{
36+
return sycl::float4{
3737
compress(linear.r()),
3838
compress(linear.g()),
3939
compress(linear.b()),
@@ -44,11 +44,11 @@ cl::sycl::float4 rgb_to_srgb(cl::sycl::float4 linear) {
4444

4545
// We could use two reduction variables to calculate minimum and maximum, but some SYCL implementations currently only support a single reduction per kernel.
4646
// Instead we build a combined minimum-maximum operation, with the side effect that we have to call `combine(x, x)` instead of `combine(x)` below.
47-
const auto minmax = [](cl::sycl::float2 a, cl::sycl::float2 b) { //
48-
return cl::sycl::float2{cl::sycl::min(a[0], b[0]), cl::sycl::max(a[1], b[1])};
47+
const auto minmax = [](sycl::float2 a, sycl::float2 b) { //
48+
return sycl::float2{sycl::min(a[0], b[0]), sycl::max(a[1], b[1])};
4949
};
5050

51-
const cl::sycl::float2 minmax_identity{INFINITY, -INFINITY};
51+
const sycl::float2 minmax_identity{INFINITY, -INFINITY};
5252

5353

5454
// Reads an image, finds minimum/maximum pixel values, stretches the histogram to increase contrast, and saves the resulting image to output.jpg.
@@ -65,9 +65,9 @@ int main(int argc, char* argv[]) {
6565
celerity::distr_queue q;
6666

6767
celerity::range<2> image_size{static_cast<size_t>(image_height), static_cast<size_t>(image_width)};
68-
celerity::buffer<cl::sycl::uchar4, 2> srgb_255_buf{reinterpret_cast<const cl::sycl::uchar4*>(srgb_255_data.get()), image_size};
69-
celerity::buffer<cl::sycl::float4, 2> lab_buf{image_size};
70-
celerity::buffer<cl::sycl::float2, 1> minmax_buf{celerity::range{1}};
68+
celerity::buffer<sycl::uchar4, 2> srgb_255_buf{reinterpret_cast<const sycl::uchar4*>(srgb_255_data.get()), image_size};
69+
celerity::buffer<sycl::float4, 2> lab_buf{image_size};
70+
celerity::buffer<sycl::float2, 1> minmax_buf{celerity::range{1}};
7171

7272
q.submit([=](celerity::handler& cgh) {
7373
celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::read_only};
@@ -97,7 +97,7 @@ int main(int argc, char* argv[]) {
9797
for(int i = 0; i < 3; ++i) {
9898
rgb[i] = (rgb[i] - min) / (max - min);
9999
}
100-
srgb_255_acc[item] = cl::sycl::round((rgb_to_srgb(rgb) * 255.0f)).convert<unsigned char>();
100+
srgb_255_acc[item] = sycl::round((rgb_to_srgb(rgb) * 255.0f)).convert<unsigned char>();
101101
});
102102
});
103103

examples/syncing/syncing.cc

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,40 @@
1-
#include <cstdio>
21
#include <vector>
32

43
#include <celerity.h>
54

6-
using namespace celerity;
7-
85
int main(int argc, char* argv[]) {
9-
constexpr int N = 10;
6+
constexpr size_t buf_size = 512;
107

11-
celerity::distr_queue q;
12-
celerity::buffer<int, 1> buff(N);
13-
std::vector<int> host_buff(N);
8+
celerity::distr_queue queue;
9+
celerity::buffer<size_t, 1> buf(buf_size);
1410

15-
q.submit([=](handler& cgh) {
16-
celerity::accessor b{buff, cgh, access::one_to_one{}, celerity::write_only, celerity::no_init};
17-
cgh.parallel_for<class mat_mul>(celerity::range<1>(N), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); });
11+
// Initialize buffer in a distributed device kernel
12+
queue.submit([=](celerity::handler& cgh) {
13+
celerity::accessor b{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
14+
cgh.parallel_for<class write_linear_id>(buf.get_range(), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); });
1815
});
1916

20-
q.submit(celerity::allow_by_ref, [=, &host_buff](handler& cgh) {
21-
celerity::accessor b{buff, cgh, access::all{}, celerity::read_only_host_task};
22-
cgh.host_task(on_master_node, [=, &host_buff] {
23-
std::this_thread::sleep_for(std::chrono::milliseconds(10)); // give the synchronization more time to fail
24-
for(int i = 0; i < N; i++) {
25-
host_buff[i] = b[i];
17+
// Process values on the host
18+
std::vector<size_t> host_buf(buf_size);
19+
queue.submit(celerity::allow_by_ref, [=, &host_buf](celerity::handler& cgh) {
20+
celerity::accessor b{buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
21+
cgh.host_task(celerity::experimental::collective, [=, &host_buf](celerity::experimental::collective_partition) {
22+
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // give the synchronization more time to fail
23+
for(size_t i = 0; i < buf_size; i++) {
24+
host_buf[i] = 2 * b[i];
2625
}
2726
});
2827
});
2928

30-
q.slow_full_sync();
29+
// Wait until both tasks have completed
30+
queue.slow_full_sync();
3131

32-
int rank = 1;
33-
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
32+
// At this point we can safely interact with host_buf from within the main thread
3433
bool valid = true;
35-
if(rank == 0) {
36-
for(int i = 0; i < N; i++) {
37-
if(host_buff[i] != i) valid = false;
34+
for(size_t i = 0; i < buf_size; i++) {
35+
if(host_buf[i] != 2 * i) {
36+
valid = false;
37+
break;
3838
}
3939
}
4040

0 commit comments

Comments
 (0)