Skip to content

Commit fc99855

Browse files
committed
Initialize queue and pass workgroupsize
1 parent 63b2e78 commit fc99855

File tree

2 files changed

+17
-7
lines changed

2 files changed

+17
-7
lines changed

src/kernels.jl

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,29 @@ end
3333
particle
3434
end
3535

36-
@kernel function update_particle_states!(prob, gpu_particles::AbstractArray{SPSOParticle{T1,T2}}, gbest_ref, w,
36+
@kernel function update_particle_states!(prob,
37+
gpu_particles::AbstractArray{SPSOParticle{T1, T2}}, gbest_ref, w,
3738
opt::ParallelPSOKernel, lock; c1 = 1.4962f0,
38-
c2 = 1.4962f0) where {T1,T2}
39+
c2 = 1.4962f0) where {T1, T2}
3940
i = @index(Global, Linear)
4041
# FIXME: Determine the right amount of shared memory (shmem) to use
41-
best_queue = @localmem SPSOGBest{T1,T2} 1024
42+
43+
@uniform gs = @groupsize()[1]
44+
45+
best_queue = @localmem SPSOGBest{T1, T2} (gs)
4246
queue_num = @localmem UInt32 1
4347

4448
@inbounds gbest = gbest_ref[1]
4549
@inbounds particle = gpu_particles[i]
4650

51+
# Initialize the cost of every queue entry to Inf
52+
for bq_idx in 1:gs
53+
best_queue[bq_idx] = SPSOGBest(particle.best_position,
54+
convert(typeof(particle.cost), Inf))
55+
end
56+
57+
@synchronize
58+
4759
particle = update_particle_state(particle, prob, gbest, w, c1, c2, i, opt)
4860
@inbounds gpu_particles[i] = particle
4961

@@ -52,11 +64,9 @@ end
5264
if particle.best_cost < gbest.cost
5365
queue_idx = @atomic queue_num[1] += UInt32(1)
5466
@inbounds best_queue[queue_idx] = SPSOGBest(particle.best_position,
55-
particle.best_cost)
67+
particle.best_cost)
5668
end
57-
5869
@synchronize
59-
6070
if i <= first(@ndrange())
6171
tidx = @index(Local, Linear)
6272
if tidx == 1

src/lowerlevel_solve.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ function vectorized_solve!(prob,
3535

3636
backend = get_backend(gpu_particles)
3737

38-
kernel = update_particle_states!(backend)
38+
kernel = update_particle_states!(backend, 1024)
3939

4040
lock = KernelAbstractions.allocate(backend, UInt32, 1)
4141
fill!(lock, UInt32(0))

0 commit comments

Comments
 (0)