We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 6e38e60, commit ba255da
src/accumulate/accumulate_nd.jl
@@ -227,7 +227,7 @@ end
227
# We have a block of threads to accumulate along the dims axis; do it in chunks of
228
# 2 * block_size and keep track of previous chunks' running prefix
229
ichunk = typeof(iblock)(0)
230
- num_chunks = (length_dims + block_size - 0x1) ÷ block_size
+ num_chunks = (length_dims + (0x2 * block_size) - 0x1) ÷ (0x2 * block_size)
231
total = neutral
232
233
if ithread == 0x0
@@ -326,7 +326,7 @@ end
326
327
# ...and accumulate the last value too
328
if bi == 0x2 * block_size - 0x1
329
- if iblock < num_chunks - 0x1
+ if ichunk < num_chunks - 0x1
330
temp[bi + bank_offset_b + 0x1] = op(t2, v[
331
input_base_idx +
332
((ichunk + 0x1) * block_size * 0x2 - 0x1) * vstrides[dims] +
0 commit comments