
Commit ad77bbb

feat: add linear recurrence with MIT license
1 parent d372cee commit ad77bbb

3 files changed: +352 -0 lines changed

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
Copyright (c) <2017> <eric@ericmart.in>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h> // for calloc/free in test()

#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))

#define gpuErrChk(ans) \
  { \
    gpuAssert((ans), __FILE__, __LINE__); \
  }
void gpuAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
  }
}

__device__ int2 divide_work(int n_jobs, int n_workers, int worker_idx) {
  // Each worker will do a contiguous slice of either n_jobs / n_workers
  // or ceil_div(n_jobs, n_workers) jobs. The return value is an int2
  // representing a half-open interval of jobs for the worker to perform
  // (perform jobs i for a <= i < b).

  int cd = CEIL_DIV(n_jobs, n_workers);
  int d = n_jobs / n_workers;

  int doing_cd = n_jobs % n_workers;

  int2 retval;
  if (worker_idx < doing_cd) {
    retval.x = worker_idx * cd;
    retval.y = retval.x + cd;
  } else {
    retval.x = doing_cd * cd + (worker_idx - doing_cd) * d;
    retval.y = retval.x + d;
  }

  return retval;
}
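
// For example, divide_work(10, 4, w) yields cd = 3, d = 2, doing_cd = 2,
// so workers 0..3 get the half-open slices [0, 3), [3, 6), [6, 8), and
// [8, 10): the first (n_jobs % n_workers) workers take ceil-sized slices
// and the rest take floor-sized slices.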

__device__ int2 compute_warp_start_stop(int block_idx, int warp_idx,
                                        int n_blocks, int n_steps) {
  int2 block_ss = divide_work(n_steps, n_blocks, block_idx);
  int block_start = block_ss.x;
  int block_stop = block_ss.y;
  int block_jobs = block_stop - block_start;

  int2 warp_ss = divide_work(block_jobs, 32, warp_idx);
  int warp_start = block_start + warp_ss.x;
  int warp_stop = block_start + warp_ss.y;

  int2 retval;
  retval.x = warp_start;
  retval.y = warp_stop;
  return retval;
}
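
// For example, with n_steps = 1000 and n_blocks = 30, block 0 is assigned
// steps [0, 34); its 32 warps then split that range so that warps 0 and 1
// cover two steps each ([0, 2) and [2, 4)) and warps 2..31 cover one step
// each.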

// decay_storage, h_storage:
//   each an n_dims x 33 x n_blocks matrix on GPU, with the 33rd column used
//   for the block reduction
__global__ void reduction_kernel(float *decays, float *impulses,
                                 float *initial_state, float *_decay_storage,
                                 float *_h_storage, int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  float *decay_storage = &_decay_storage[blockIdx.x * 33 * n_dims];
  float *h_storage = &_h_storage[blockIdx.x * 33 * n_dims];

  int2 start_stop =
      compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Reduce within warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from warp_start to warp_stop (including initial state) at index
   * (feature_idx, warp, block).
   */
  for (int i = lane; i < n_dims; i += 32) {
    float cum_decay = 1.0;
    float h = 0.0;
    if (blockIdx.x == 0 && warp == 0 && initial_state != NULL) {
      h = initial_state[i];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      cum_decay *= decays[i + t * n_dims];
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
    }

    // TODO: store into shared memory, work in shared memory sized blocks
    // store into global memory
    decay_storage[i + warp * n_dims] = cum_decay;
    h_storage[i + warp * n_dims] = h;
  }

  __syncthreads();

  /*
   * Reduce over warps.
   * After this loop exits, the storage arrays should contain the reduction
   * from block_start to block_finish (including initial state) at index
   * (feature_idx, 32, block).
   */
  // TODO: parallel reduction (or scan). Need to worry about changing the warp
  // reduction values (as I use them again later)
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    float cum_decay = 1.0;
    float h = 0.0;
    for (int t = 0; t < 32; t++) {
      cum_decay *= decay_storage[i + t * n_dims];
      h = decay_storage[i + t * n_dims] * h + h_storage[i + t * n_dims];
    }
    decay_storage[i + 32 * n_dims] = cum_decay;
    h_storage[i + 32 * n_dims] = h;
  }
}
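
// Mathematically, each (cum_decay, h) pair represents the affine map
// x -> cum_decay * x + h. Composing a segment (a1, b1) with a following
// segment (a2, b2) gives (a1 * a2, a2 * b1 + b2), which is exactly the
// update performed in the loops above; this composition is associative,
// which is what makes the parallel reduction and scan valid.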

__global__ void block_scan_kernel(float *decay_storage, float *h_storage,
                                  int n_dims, int n_blocks) {
  /*
   * Scan over blocks.
   * After this loop exits, the storage arrays should contain the cumulative
   * sum from block_idx 0 to i (inclusive) at index (feature_idx, 32, i).
   * This means (feature_idx, 32, 2) contains the reduction of blocks 0, 1,
   * and 2.
   */
  // TODO: parallel scan (tricky because number of blocks isn't necessarily
  // smaller than number of warps that can fit in a single block)
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n_dims;
       i += blockDim.x * gridDim.x) {
    for (int t = 1; t < n_blocks; t++) {
      int cur_idx = i + 32 * n_dims + t * 33 * n_dims;
      int prev_idx = i + 32 * n_dims + (t - 1) * 33 * n_dims;

      // TODO: remove unnecessary reads from global memory (prev_idx
      // accesses)
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] +
                           h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }
}
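
// Storage indexing example: with n_dims = 100, the block-level reduction
// for feature i of block t lives at index i + 32 * 100 + t * 33 * 100 in
// both storage arrays (feature varies fastest, then the 33 warp slots,
// then the block).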

__global__ void warp_scan_kernel(float *decays, float *impulses,
                                 float *initial_state, float *out,
                                 float *decay_storage, float *h_storage,
                                 int n_dims, int n_steps) {
  int warp = threadIdx.x / 32;
  int lane = threadIdx.x % 32;

  // Note: Due to the index ordering of the storage arrays, the following
  // indices are equivalent when t is 0:
  //
  //   i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims
  //   i + 32 * n_dims + (blockIdx.x - 1) * 33 * n_dims
  //
  // This means something that looks like negative indexing (t - 1) can be
  // used to safely access the stored value for the previous warp (even if
  // the previous warp belonged to the previous block).

  /*
   * Scan over warps.
   * After this loop executes, the storage arrays should contain the
   * cumulative sum from the beginning of the sequence (including the
   * initial condition) up to and including the indexed warp and block.
   */
  // TODO: parallel scan
  for (int i = lane + 32 * warp; i < n_dims; i += blockDim.x) {
    for (int t = 0; t < 32; t++) {
      if (t == 0 && blockIdx.x == 0) {
        // the reduction over warp 0 (including the initial condition) is
        // already the correct value for the scan, so there's no work to do
        continue;
      }

      int cur_idx = i + t * n_dims + blockIdx.x * 33 * n_dims;
      int prev_idx = i + (t - 1) * n_dims + blockIdx.x * 33 * n_dims;
      h_storage[cur_idx] = decay_storage[cur_idx] * h_storage[prev_idx] +
                           h_storage[cur_idx];
      decay_storage[cur_idx] *= decay_storage[prev_idx];
    }
  }

  __syncthreads();

  int2 start_stop =
      compute_warp_start_stop(blockIdx.x, warp, gridDim.x, n_steps);
  int warp_start = start_stop.x;
  int warp_stop = start_stop.y;

  /*
   * Scan within warps.
   * This loop writes to the output array. Each warp reads in its initial
   * state (either from "initial_state" or from the storage arrays) and then
   * writes to the output for indices warp_start up to warp_stop.
   */
  for (int i = lane; i < n_dims; i += 32) {
    float h = 0.0;
    if (blockIdx.x == 0 && warp == 0) {
      if (initial_state != NULL) {
        h = initial_state[i];
      }
    } else {
      h = h_storage[i + (warp - 1) * n_dims + blockIdx.x * 33 * n_dims];
    }

    for (int t = warp_start; t < warp_stop; t++) {
      h = decays[i + t * n_dims] * h + impulses[i + t * n_dims];
      out[i + t * n_dims] = h;
    }
  }
}

__global__ void serial_linear_recurrence(float *decays, float *impulses,
                                         float *initial_state, float *out,
                                         int n_dims, int n_steps) {
  // computes h_t = lambda_t * h_{t-1} + x_t

  for (int dim_idx = threadIdx.x + blockIdx.x * blockDim.x; dim_idx < n_dims;
       dim_idx += blockDim.x * gridDim.x) {
    // mirror the NULL handling of the parallel kernels: a NULL initial
    // state means the recurrence starts from zero
    float val = (initial_state != NULL) ? initial_state[dim_idx] : 0.0f;

    for (int step = 0; step < n_steps; step++) {
      int idx = dim_idx + step * n_dims;
      val = decays[idx] * val + impulses[idx];
      out[idx] = val;
    }
  }
}
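
// The parallel implementation proceeds in three phases:
//   1. reduction_kernel reduces each warp's contiguous slice of time steps
//      (and then each block's 32 warps) to a single (decay, h) pair.
//   2. block_scan_kernel scans those per-block reductions sequentially, so
//      each block knows the state it inherits from all earlier blocks.
//   3. warp_scan_kernel scans across the warps within each block, then
//      replays each warp's slice of time steps to produce the output.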

extern "C" {
/*
 * This is the main method for the prefix sum kernels.
 * decays, impulses, out:
 *   each an n_dims x n_steps column major matrix located on GPU
 * initial_state:
 *   array of size n_dims located on GPU
 */
void compute_linear_recurrence(float *decays, float *impulses,
                               float *initial_state, float *out, int n_dims,
                               int n_steps) {
  // TODO: query the device instead of hardcoding these
  int n_SMs = 15;
  int n_blocks_per_sm = 2;

  // we want at least 32 elements per block, but no reason to run
  // with more than the maximum number of concurrent blocks
  int n_blocks = min(CEIL_DIV(n_steps, 32), n_SMs * n_blocks_per_sm);

  // TODO: make user pass in working memory? This allows integration
  // with CNMeM (used by Theano)
  int reduction_mem_sz = 2 * n_blocks * 33 * n_dims * sizeof(float);
  float *d_reduction_mem;
  gpuErrChk(cudaMalloc(&d_reduction_mem, reduction_mem_sz));
  float *d_decay_storage = &d_reduction_mem[0 * n_blocks * 33 * n_dims];
  float *d_h_storage = &d_reduction_mem[1 * n_blocks * 33 * n_dims];

  // TODO: run kernels on non-default stream?
  reduction_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state,
                                       d_decay_storage, d_h_storage, n_dims,
                                       n_steps);

  block_scan_kernel<<<n_blocks, 1024>>>(d_decay_storage, d_h_storage, n_dims,
                                        n_blocks);

  warp_scan_kernel<<<n_blocks, 1024>>>(decays, impulses, initial_state, out,
                                       d_decay_storage, d_h_storage, n_dims,
                                       n_steps);

  gpuErrChk(cudaFree(d_reduction_mem));
}
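
// Working-memory footprint: 2 storage matrices * n_blocks * 33 * n_dims
// floats. For example, n_dims = 100 and n_blocks = 30 (the test() sizes)
// give 2 * 30 * 33 * 100 * 4 bytes = 792,000 bytes of temporary device
// memory, under 1 MB.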

void compute_serial_linear_recurrence(float *decays, float *impulses,
                                      float *initial_state, float *out,
                                      int n_dims, int n_steps) {
  // TODO: query the device instead of hardcoding these
  int n_SMs = 15;
  int n_blocks_per_sm = 2;

  int n_blocks = n_SMs * n_blocks_per_sm;
  serial_linear_recurrence<<<n_blocks, 1024>>>(
      decays, impulses, initial_state, out, n_dims, n_steps);
}
}

void test() {
  int n_dims = 100;
  int n_steps = 1000000;
  int n_elements = n_dims * n_steps;

  float *decays = (float *)calloc(n_elements, sizeof(float));
  for (int i = 0; i < n_elements; i++) {
    decays[i] = .999;
  }
  float *d_decays;
  gpuErrChk(cudaMalloc(&d_decays, n_elements * sizeof(float)));
  gpuErrChk(cudaMemcpy(d_decays, decays, n_elements * sizeof(float),
                       cudaMemcpyHostToDevice));

  float *impulses = (float *)calloc(n_elements, sizeof(float));
  for (int i = 0; i < n_dims; i++) {
    impulses[i + 0 * n_dims] = 2.0;
  }
  float *d_impulses;
  gpuErrChk(cudaMalloc(&d_impulses, n_elements * sizeof(float)));
  gpuErrChk(cudaMemcpy(d_impulses, impulses, n_elements * sizeof(float),
                       cudaMemcpyHostToDevice));

  float *out = (float *)calloc(n_elements, sizeof(float));
  float *d_out;
  gpuErrChk(cudaMalloc(&d_out, n_elements * sizeof(float)));
  gpuErrChk(cudaMemset(d_out, 0, n_elements * sizeof(float)));

  compute_linear_recurrence(d_decays, d_impulses, NULL, d_out, n_dims,
                            n_steps);
  gpuErrChk(cudaMemcpy(out, d_out, n_elements * sizeof(float),
                       cudaMemcpyDeviceToHost));

  gpuErrChk(cudaFree(d_decays));
  gpuErrChk(cudaFree(d_impulses));
  gpuErrChk(cudaFree(d_out));

  // release the host-side buffers as well (~400 MB each)
  free(decays);
  free(impulses);
  free(out);
}
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
extern "C" {
void compute_linear_recurrence(
    const float *decays,        /* n_steps x n_dims row major matrix */
    const float *impulses,      /* n_steps x n_dims row major matrix */
    const float *initial_state, /* n_dims vector */
    float *out,                 /* n_steps x n_dims row major matrix */
    int n_dims,                 /* dimensionality of recurrent vector */
    int n_steps                 /* length of input and output sequences */
);
}
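
A minimal sketch of a host-side caller of this interface, assuming the .cu
file above is compiled and linked with nvcc; the file name and the main()
below are hypothetical and not part of this commit:

/* hypothetical_caller.cu -- illustrative only */
#include <cuda_runtime.h>
#include <stdlib.h>

extern "C" void compute_linear_recurrence(const float *decays,
                                          const float *impulses,
                                          const float *initial_state,
                                          float *out, int n_dims, int n_steps);

int main(void) {
  int n_dims = 4, n_steps = 8, n = n_dims * n_steps;

  // constant decay of 0.9 driven by a unit impulse at every step
  float *h_decays = (float *)malloc(n * sizeof(float));
  float *h_impulses = (float *)malloc(n * sizeof(float));
  float *h_out = (float *)malloc(n * sizeof(float));
  for (int i = 0; i < n; i++) {
    h_decays[i] = 0.9f;
    h_impulses[i] = 1.0f;
  }

  // every pointer handed to compute_linear_recurrence must live on the GPU
  float *d_decays, *d_impulses, *d_out;
  cudaMalloc(&d_decays, n * sizeof(float));
  cudaMalloc(&d_impulses, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_decays, h_decays, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_impulses, h_impulses, n * sizeof(float),
             cudaMemcpyHostToDevice);

  // a NULL initial state starts the recurrence from zero
  compute_linear_recurrence(d_decays, d_impulses, NULL, d_out, n_dims,
                            n_steps);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

  // h_out[i + t * n_dims] now holds h_t for feature i
  cudaFree(d_decays);
  cudaFree(d_impulses);
  cudaFree(d_out);
  free(h_decays);
  free(h_impulses);
  free(h_out);
  return 0;
}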
