Skip to content

Commit a1a7814

Browse files
committed
Merge pull request #434 from kylelutz/parameter-cache
Add parameter_cache class
2 parents 238ab82 + 4175a85 commit a1a7814

File tree

13 files changed

+684
-209
lines changed

13 files changed

+684
-209
lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ compiler:
33
- gcc
44
- clang
55
before_install:
6+
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
67
- sudo apt-get update -qq
7-
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers libboost-chrono1.48-dev libboost-date-time1.48-dev libboost-test1.48-dev libboost-system1.48-dev libboost-filesystem1.48-dev libboost-timer1.48-dev libboost-program-options1.48-dev libboost-thread1.48-dev python-yaml lcov libopencv-dev
8+
- sudo apt-get install -qq fglrx=2:8.960-0ubuntu1 opencl-headers libboost-chrono1.48-dev libboost-date-time1.48-dev libboost-test1.48-dev libboost-system1.48-dev libboost-filesystem1.48-dev libboost-timer1.48-dev libboost-program-options1.48-dev libboost-thread1.48-dev python-yaml lcov libopencv-dev g++-4.8
89
- gem install coveralls-lcov
910
script:
1011
- mkdir -p build

include/boost/compute/algorithm/detail/binary_find.hpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <boost/compute/algorithm/find_if.hpp>
1616
#include <boost/compute/algorithm/transform.hpp>
1717
#include <boost/compute/command_queue.hpp>
18+
#include <boost/compute/detail/parameter_cache.hpp>
1819

1920
namespace boost {
2021
namespace compute {
@@ -28,11 +29,9 @@ namespace detail{
2829
class binary_find_kernel : public meta_kernel
2930
{
3031
public:
31-
size_t threads;
32-
33-
binary_find_kernel() : meta_kernel("binary_find")
32+
binary_find_kernel(size_t threads) : meta_kernel("binary_find")
3433
{
35-
threads = 128;
34+
m_threads = threads;
3635
}
3736

3837
template<class InputIterator, class UnaryPredicate>
@@ -41,7 +40,7 @@ class binary_find_kernel : public meta_kernel
4140
UnaryPredicate predicate)
4241
{
4342
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
44-
int block = (iterator_range_size(first, last)-1)/(threads-1);
43+
int block = (iterator_range_size(first, last)-1)/(m_threads-1);
4544

4645
m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index");
4746

@@ -60,10 +59,11 @@ class binary_find_kernel : public meta_kernel
6059
{
6160
set_arg(m_index_arg, index.get_buffer());
6261

63-
return exec_1d(queue, 0, threads);
62+
return exec_1d(queue, 0, m_threads);
6463
}
6564

6665
private:
66+
size_t m_threads;
6767
size_t m_index_arg;
6868
};
6969

@@ -84,16 +84,23 @@ inline InputIterator binary_find(InputIterator first,
8484
UnaryPredicate predicate,
8585
command_queue &queue = system::default_queue())
8686
{
87+
const device &device = queue.get_device();
88+
89+
boost::shared_ptr<parameter_cache> parameters =
90+
detail::parameter_cache::get_global_cache(device);
91+
92+
const std::string cache_key = "__boost_binary_find";
93+
8794
size_t find_if_limit = 128;
88-
size_t threads = 128;
95+
size_t threads = parameters->get(cache_key, "tpb", 128);
8996
size_t count = iterator_range_size(first, last);
9097

9198
while(count > find_if_limit) {
9299

93100
scalar<uint_> index(queue.get_context());
94101
index.write(static_cast<uint_>(count), queue);
95102

96-
binary_find_kernel kernel;
103+
binary_find_kernel kernel(threads);
97104
kernel.set_range(first, last, predicate);
98105
kernel.exec(queue, index);
99106

include/boost/compute/algorithm/detail/copy_on_device.hpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <boost/compute/memory/svm_ptr.hpp>
2121
#include <boost/compute/detail/iterator_range_size.hpp>
2222
#include <boost/compute/detail/meta_kernel.hpp>
23+
#include <boost/compute/detail/parameter_cache.hpp>
2324
#include <boost/compute/detail/work_size.hpp>
2425

2526
namespace boost {
@@ -42,12 +43,21 @@ template<class InputIterator, class OutputIterator>
4243
class copy_kernel : public meta_kernel
4344
{
4445
public:
45-
copy_kernel()
46+
copy_kernel(const device &device)
4647
: meta_kernel("copy")
4748
{
4849
m_count = 0;
49-
m_vpt = 4;
50-
m_tpb = 128;
50+
51+
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
52+
53+
boost::shared_ptr<parameter_cache> parameters =
54+
detail::parameter_cache::get_global_cache(device);
55+
56+
std::string cache_key =
57+
"__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
58+
59+
m_vpt = parameters->get(cache_key, "vpt", 4);
60+
m_tpb = parameters->get(cache_key, "tpb", 128);
5161
}
5262

5363
void set_range(InputIterator first,
@@ -97,7 +107,9 @@ inline OutputIterator copy_on_device(InputIterator first,
97107
OutputIterator result,
98108
command_queue &queue)
99109
{
100-
copy_kernel<InputIterator, OutputIterator> kernel;
110+
const device &device = queue.get_device();
111+
112+
copy_kernel<InputIterator, OutputIterator> kernel(device);
101113

102114
kernel.set_range(first, last, result);
103115
kernel.exec(queue);
@@ -122,7 +134,9 @@ inline future<OutputIterator> copy_on_device_async(InputIterator first,
122134
OutputIterator result,
123135
command_queue &queue)
124136
{
125-
copy_kernel<InputIterator, OutputIterator> kernel;
137+
const device &device = queue.get_device();
138+
139+
copy_kernel<InputIterator, OutputIterator> kernel(device);
126140

127141
kernel.set_range(first, last, result);
128142
event event_ = kernel.exec(queue);

include/boost/compute/algorithm/detail/inplace_reduce.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
#include <iterator>
1515

16+
#include <boost/utility/result_of.hpp>
17+
1618
#include <boost/compute/command_queue.hpp>
1719
#include <boost/compute/container/vector.hpp>
1820
#include <boost/compute/detail/iterator_range_size.hpp>

include/boost/compute/algorithm/detail/radix_sort.hpp

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@
2323
#include <boost/compute/algorithm/exclusive_scan.hpp>
2424
#include <boost/compute/container/vector.hpp>
2525
#include <boost/compute/detail/iterator_range_size.hpp>
26+
#include <boost/compute/detail/parameter_cache.hpp>
27+
#include <boost/compute/type_traits/type_name.hpp>
2628
#include <boost/compute/type_traits/is_fundamental.hpp>
2729
#include <boost/compute/type_traits/is_vector_type.hpp>
28-
#include <boost/compute/type_traits/type_name.hpp>
2930
#include <boost/compute/utility/program_cache.hpp>
3031

3132
namespace boost {
@@ -232,19 +233,9 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
232233
typedef T value_type;
233234
typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;
234235

236+
const device &device = queue.get_device();
235237
const context &context = queue.get_context();
236238

237-
size_t count = detail::iterator_range_size(first, last);
238-
239-
// sort parameters
240-
const uint_ k = 4;
241-
const uint_ k2 = 1 << k;
242-
const uint_ block_size = 128;
243-
244-
uint_ block_count = static_cast<uint_>(count / block_size);
245-
if(block_count * block_size != count){
246-
block_count++;
247-
}
248239

249240
// if we have a valid values iterator then we are doing a
250241
// sort by key and have to set up the values buffer
@@ -258,6 +249,17 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
258249
cache_key += std::string("_with_") + type_name<T2>();
259250
}
260251

252+
boost::shared_ptr<program_cache> cache =
253+
program_cache::get_global_cache(context);
254+
boost::shared_ptr<parameter_cache> parameters =
255+
detail::parameter_cache::get_global_cache(device);
256+
257+
// sort parameters
258+
const uint_ k = parameters->get(cache_key, "k", 4);
259+
const uint_ k2 = 1 << k;
260+
const uint_ block_size = parameters->get(cache_key, "tpb", 128);
261+
262+
// sort program compiler options
261263
std::stringstream options;
262264
options << "-DK_BITS=" << k;
263265
options << " -DT=" << type_name<sort_type>();
@@ -277,17 +279,22 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
277279
options << enable_double<T2>();
278280
}
279281

280-
// load (or create) radix sort program
281-
boost::shared_ptr<program_cache> cache =
282-
program_cache::get_global_cache(context);
283-
284-
program radix_sort_program =
285-
cache->get_or_build(cache_key, options.str(), radix_sort_source, context);
282+
// load (or build) radix sort program from the program cache
283+
program radix_sort_program = cache->get_or_build(
284+
cache_key, options.str(), radix_sort_source, context
285+
);
286286

287287
kernel count_kernel(radix_sort_program, "count");
288288
kernel scan_kernel(radix_sort_program, "scan");
289289
kernel scatter_kernel(radix_sort_program, "scatter");
290290

291+
size_t count = detail::iterator_range_size(first, last);
292+
293+
uint_ block_count = static_cast<uint_>(count / block_size);
294+
if(block_count * block_size != count){
295+
block_count++;
296+
}
297+
291298
// setup temporary buffers
292299
vector<value_type> output(count, context);
293300
vector<T2> values_output(sort_by_key ? count : 0, context);

include/boost/compute/algorithm/detail/reduce_on_gpu.hpp

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313

1414
#include <iterator>
1515

16+
#include <boost/compute/utility/source.hpp>
1617
#include <boost/compute/program.hpp>
1718
#include <boost/compute/command_queue.hpp>
1819
#include <boost/compute/detail/vendor.hpp>
20+
#include <boost/compute/detail/parameter_cache.hpp>
1921
#include <boost/compute/detail/work_size.hpp>
2022
#include <boost/compute/detail/meta_kernel.hpp>
21-
#include <boost/compute/type_traits/result_of.hpp>
2223
#include <boost/compute/type_traits/type_name.hpp>
2324
#include <boost/compute/utility/program_cache.hpp>
24-
#include <boost/compute/utility/source.hpp>
2525

2626
namespace boost {
2727
namespace compute {
@@ -99,7 +99,7 @@ inline void initial_reduce(InputIterator first,
9999
(void) reduce_kernel;
100100

101101
typedef typename std::iterator_traits<InputIterator>::value_type Arg;
102-
typedef typename ::boost::compute::result_of<Function(Arg, Arg)>::type T;
102+
typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
103103

104104
size_t count = std::distance(first, last);
105105
detail::meta_kernel k("initial_reduce");
@@ -174,6 +174,7 @@ inline void reduce_on_gpu(InputIterator first,
174174
command_queue &queue)
175175
{
176176
const device &device = queue.get_device();
177+
const context &context = queue.get_context();
177178

178179
detail::meta_kernel k("reduce");
179180
k.add_arg<const T*>(memory_object::global_memory, "input");
@@ -210,28 +211,34 @@ inline void reduce_on_gpu(InputIterator first,
210211
" output[output_offset + get_group_id(0)] = scratch[0];\n" <<
211212
"}\n";
212213

213-
uint_ vpt = 8;
214-
uint_ tpb = 128;
214+
std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
215215

216-
size_t count = std::distance(first, last);
216+
// load parameters
217+
boost::shared_ptr<parameter_cache> parameters =
218+
detail::parameter_cache::get_global_cache(device);
217219

218-
const context &context = queue.get_context();
220+
uint_ vpt = parameters->get(cache_key, "vpt", 8);
221+
uint_ tpb = parameters->get(cache_key, "tpb", 128);
222+
223+
// reduce program compiler flags
224+
std::stringstream options;
225+
options << "-DT=" << type_name<T>()
226+
<< " -DVPT=" << vpt
227+
<< " -DTPB=" << tpb;
219228

220-
// load (or create) reduce program
229+
// load (or build) reduce program from the program cache
221230
boost::shared_ptr<program_cache> cache =
222231
program_cache::get_global_cache(context);
223232

224-
std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
225-
226-
std::stringstream options;
227-
options << "-DT=" << type_name<T>() << " -DVPT=" << vpt << " -DTPB=" << tpb;
228-
229-
program reduce_program =
230-
cache->get_or_build(cache_key, options.str(), k.source(), context);
233+
program reduce_program = cache->get_or_build(
234+
cache_key, options.str(), k.source(), context
235+
);
231236

232237
// create reduce kernel
233238
kernel reduce_kernel(reduce_program, "reduce");
234239

240+
size_t count = std::distance(first, last);
241+
235242
// first pass, reduce from input to ping
236243
buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
237244
initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

0 commit comments

Comments
 (0)