#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define gpuErrChk(ans) \
  do { \
    gpuAssert((ans), __FILE__, __LINE__); \
  } while (0)

void gpuAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
  }
}

__device__ int2 divide_work(int n_jobs, int n_workers, int worker_idx) {
  // Each worker handles a contiguous slice of either n_jobs / n_workers
  // or ceil_div(n_jobs, n_workers) jobs. The return value is an int2
  // representing a half-open interval of jobs for the worker to perform
  // (perform jobs i with retval.x <= i < retval.y).
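  //
  // For example, divide_work(10, 4, w) yields [0, 3), [3, 6), [6, 8), [8, 10)
  // for workers w = 0..3: the first 10 % 4 = 2 workers take
  // ceil_div(10, 4) = 3 jobs each and the rest take 10 / 4 = 2.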

  int cd = CEIL_DIV(n_jobs, n_workers);
  int d = n_jobs / n_workers;

  int doing_cd = n_jobs % n_workers;

  int2 retval;
  if (worker_idx < doing_cd) {
    retval.x = worker_idx * cd;
    retval.y = retval.x + cd;
  } else {
    retval.x = doing_cd * cd + (worker_idx - doing_cd) * d;
    retval.y = retval.x + d;
  }

  return retval;
}

__device__ int2 compute_warp_start_stop(int block_idx, int warp_idx,
                                        int n_blocks, int n_steps) {
  int2 block_ss = divide_work(n_steps, n_blocks, block_idx);
  int block_start = block_ss.x;
  int block_stop = block_ss.y;
  int block_jobs = block_stop - block_start;

  int2 warp_ss = divide_work(block_jobs, 32, warp_idx);
  int warp_start = block_start + warp_ss.x;
  int warp_stop = block_start + warp_ss.y;

  int2 retval;
  retval.x = warp_start;
  retval.y = warp_stop;
  return retval;
}

// decay_storage, h_storage:
//   each an n_dims x 33 x n_blocks array on the GPU, where the 33rd slot
//   along the middle dimension (index 32) holds the per-block reduction
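//
// Concretely, the value for (feature i, warp w, block b) is stored at flat
// index i + w * n_dims + b * 33 * n_dims, with w == 32 being the
// block-reduction slot; this is the layout the kernels below index into.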
__global__ void reduction_kernel(float *decays, float *impulses,
                                 float *initial_state, float *_decay_storage,
                                 float *_h_storage, int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  float *decay_storage = &_decay_storage[blockIdx.x * 33 * n_dims];
  float *h_storage = &_h_storage[blockIdx.x * 33 * n_dims];

  int2 start_stop =
      compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Reduce within warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from warp_start to warp_stop (including initial state) at index
   * (feature_idx, warp, block).
   */
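  /*
   * The reduction composes affine maps: a segment with cumulative decay a1
   * and accumulated impulse b1 followed by a segment with (a2, b2) reduces
   * to (a1 * a2, a2 * b1 + b2). The loop below builds this up one step at a
   * time by folding each (decay, impulse) pair into (cum_decay, h).
   */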
  for (int i = lane; i < n_dims; i += 32) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0 && initial_state != NULL) {
      h = initial_state[i];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      cum_decay *= decays[i + t * n_dims];
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
    }

    // TODO: store into shared memory, work in shared memory sized blocks
    // store into global memory
    decay_storage[i + warp * n_dims] = cum_decay;
    h_storage[i + warp * n_dims] = h;
  }

  __syncthreads();

  /*
   * Reduce over warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from block_start to block_stop (including initial state) at index
   * (feature_idx, 32, block).
   */
  // TODO: parallel reduction (or scan). Need to be careful about overwriting
  // the warp reduction values, since they are used again later.
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    float cum_decay = 1.0f;
    float h = 0.0f;
    for (int t = 0; t < 32; t++) {
      cum_decay *= decay_storage[i + t * n_dims];
      h = decay_storage[i + t * n_dims] * h + h_storage[i + t * n_dims];
    }
    decay_storage[i + 32 * n_dims] = cum_decay;
    h_storage[i + 32 * n_dims] = h;
  }
}

__global__ void block_scan_kernel(float *decay_storage, float *h_storage,
                                  int n_dims, int n_blocks) {
  /*
   * Scan over blocks.
   * After this loop exits, the storage arrays should contain the cumulative
   * sum from block_idx 0 to i (inclusive) at index (feature_idx, 32, i).
   * This means (feature_idx, 32, 2) contains the reduction of blocks 0, 1,
   * and 2.
   */
  // TODO: parallel scan (tricky because the number of blocks isn't
  // necessarily smaller than the number of warps that fit in a single block)
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n_dims;
       i += blockDim.x * gridDim.x) {
    for (int t = 1; t < n_blocks; t++) {
      int cur_idx = i + 32 * n_dims + t * 33 * n_dims;
      int prev_idx = i + 32 * n_dims + (t - 1) * 33 * n_dims;

      // TODO: remove unnecessary reads from global memory (prev_idx accesses)
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] +
                           h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }
}

__global__ void warp_scan_kernel(float *decays, float *impulses,
                                 float *initial_state, float *out,
                                 float *decay_storage, float *h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  // Note: Due to the index ordering of the storage arrays, the following
  // indices are equivalent when t is 0:
  //
  //   i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims
  //   i + 32 * n_dims + (blockIdx.x - 1) * 33 * n_dims
  //
  // This means something that looks like negative indexing (t - 1) can be
  // used to safely access the stored value for the previous warp (even if
  // the previous warp belonged to the previous block).
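  //
  // Spelling out the arithmetic: with t = 0 the first expression is
  // i - n_dims + blockIdx.x * 33 * n_dims, and the second expands to
  // i + 32 * n_dims + blockIdx.x * 33 * n_dims - 33 * n_dims, which is the
  // same value, i.e. the block-reduction slot of the previous block.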

  /*
   * Scan over warps.
   * After this loop executes, the storage arrays should contain the
   * cumulative sum from the beginning of the sequence (including the initial
   * condition) up to and including the indexed warp and block.
   */
  // TODO: parallel scan
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    for (int t = 0; t < 32; t++) {
      if (t == 0 && blockIdx.x == 0) {
        // The reduction over warp 0 (including the initial condition) is
        // already the correct value for the scan, so there's no work to do.
        continue;
      }

      int cur_idx = i + t * n_dims + blockIdx.x * 33 * n_dims;
      int prev_idx = i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims;
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] +
                           h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }

  __syncthreads();

  int2 start_stop =
      compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Scan within warps.
   * This loop writes to the output array. Each warp reads in its initial
   * state (either from "initial_state" or from the storage arrays) and then
   * writes to the output for indices warp_start up to warp_stop.
   */
  for (int i = lane; i < n_dims; i += 32) {
    float h = 0.0f;
    if (blockIdx.x == 0 && warp == 0) {
      if (initial_state != NULL) {
        h = initial_state[i];
      }
    } else {
      h = h_storage[i + (warp - 1) * n_dims + blockIdx.x * 33 * n_dims];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
      out[i + t * n_dims] = h;
    }
  }
}

__global__ void serial_linear_recurrence(float *decays, float *impulses,
                                         float *initial_state, float *out,
                                         int n_dims, int n_steps) {
  // computes h_t = lambda_t * h_{t-1} + x_t

  for (int dim_idx = threadIdx.x + blockIdx.x * blockDim.x; dim_idx < n_dims;
       dim_idx += blockDim.x * gridDim.x) {
    // Match the parallel kernels: a NULL initial_state means a zero state.
    float val = (initial_state != NULL) ? initial_state[dim_idx] : 0.0f;

    for (int step = 0; step < n_steps; step++) {
      int idx = dim_idx + step * n_dims;
      val = decays[idx] * val + impulses[idx];
      out[idx] = val;
    }
  }
}

extern "C" {
/*
 * This is the main entry point for the prefix-scan kernels.
 * decays, impulses, out:
 *   each an n_dims x n_steps column-major matrix located on the GPU
 * initial_state:
 *   an array of size n_dims located on the GPU, or NULL for a zero initial
 *   state
 */
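/*
 * Column-major layout here means that element (dim i, step t) of decays,
 * impulses, and out lives at flat index i + t * n_dims, which is the
 * indexing used by every kernel above.
 */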
void compute_linear_recurrence(float *decays, float *impulses,
                               float *initial_state, float *out, int n_dims,
                               int n_steps) {
  // TODO: query the device instead of hard-coding these
  int n_SMs = 15;
  int n_blocks_per_sm = 2;

  // we want at least 32 elements per block, but no reason to run
  // with more than the maximum number of concurrent blocks
  int n_blocks = min(CEIL_DIV(n_steps, 32), n_SMs * n_blocks_per_sm);

  // TODO: make the user pass in working memory? This would allow integration
  // with CNMeM (used by Theano).
  size_t reduction_mem_sz = 2 * n_blocks * 33 * n_dims * sizeof(float);
  float *d_reduction_mem;
  gpuErrChk(cudaMalloc(&d_reduction_mem, reduction_mem_sz));
  float *d_decay_storage = &d_reduction_mem[0 * n_blocks * 33 * n_dims];
  float *d_h_storage = &d_reduction_mem[1 * n_blocks * 33 * n_dims];

  // TODO: run the kernels on a non-default stream?
  reduction_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state,
                                       d_decay_storage, d_h_storage, n_dims,
                                       n_steps);

  block_scan_kernel<<<n_blocks, 1024>>>(d_decay_storage, d_h_storage, n_dims,
                                        n_blocks);

  warp_scan_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state, out,
                                       d_decay_storage, d_h_storage, n_dims,
                                       n_steps);

  gpuErrChk(cudaFree(d_reduction_mem));
}

void compute_serial_linear_recurrence(float *decays, float *impulses,
                                      float *initial_state, float *out,
                                      int n_dims, int n_steps) {
  // TODO: query
  int n_SMs = 15;
  int n_blocks_per_sm = 2;

  int n_blocks = n_SMs * n_blocks_per_sm;
  serial_linear_recurrence<<<n_blocks, 1024>>>(
      decays, impulses, initial_state, out, n_dims, n_steps);
}
}  // extern "C"

void test() {
  int n_dims = 100;
  int n_steps = 1000000;
  int n_elements = n_dims * n_steps;

  float *decays = (float *)calloc(n_elements, sizeof(float));
  for (int i = 0; i < n_elements; i++) {
    decays[i] = 0.999f;
  }
  float *d_decays;
  gpuErrChk(cudaMalloc(&d_decays, n_elements * sizeof(float)));
  gpuErrChk(cudaMemcpy(d_decays, decays, n_elements * sizeof(float),
                       cudaMemcpyHostToDevice));

  float *impulses = (float *)calloc(n_elements, sizeof(float));
  for (int i = 0; i < n_dims; i++) {
    impulses[i + 0 * n_dims] = 2.0f;
  }
  float *d_impulses;
  gpuErrChk(cudaMalloc(&d_impulses, n_elements * sizeof(float)));
  gpuErrChk(cudaMemcpy(d_impulses, impulses, n_elements * sizeof(float),
                       cudaMemcpyHostToDevice));

  float *out = (float *)calloc(n_elements, sizeof(float));
  float *d_out;
  gpuErrChk(cudaMalloc(&d_out, n_elements * sizeof(float)));
  gpuErrChk(cudaMemset(d_out, 0, n_elements * sizeof(float)));

  compute_linear_recurrence(d_decays, d_impulses, NULL, d_out, n_dims,
                            n_steps);
  gpuErrChk(cudaMemcpy(out, d_out, n_elements * sizeof(float),
                       cudaMemcpyDeviceToHost));

  gpuErrChk(cudaFree(d_decays));
  gpuErrChk(cudaFree(d_impulses));
  gpuErrChk(cudaFree(d_out));

  // Free the host-side buffers as well.
  free(decays);
  free(impulses);
  free(out);
}
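
// A minimal driver, sketched as a comment since the original file defines no
// main(): it would run test() and synchronize to surface any asynchronous
// kernel errors. With decays of 0.999 and a single impulse of 2.0 at step 0,
// out[i + t * n_dims] should come out to roughly 2.0 * pow(0.999, t).
//
// int main() {
//   test();
//   gpuErrChk(cudaDeviceSynchronize());
//   return 0;
// }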