From 12edf97387b6712b0e0f7937ae2abae7198f5aa1 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Thu, 20 May 2021 15:20:30 +0200 Subject: [PATCH 1/4] [cublas] introduce onemkl_cublas_host_task [cublas] move dpc++ internal headers into cublas_task.hpp --- src/blas/backends/cublas/cublas_batch.cpp | 22 +- .../backends/cublas/cublas_extensions.cpp | 4 +- src/blas/backends/cublas/cublas_level1.cpp | 190 +++++------ src/blas/backends/cublas/cublas_level2.cpp | 306 +++++++++--------- src/blas/backends/cublas/cublas_level3.cpp | 131 ++++---- .../backends/cublas/cublas_scope_handle.cpp | 2 +- .../backends/cublas/cublas_scope_handle.hpp | 5 +- src/blas/backends/cublas/cublas_task.hpp | 33 ++ 8 files changed, 365 insertions(+), 328 deletions(-) create mode 100644 src/blas/backends/cublas/cublas_task.hpp diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6fd32a62e..6bc6d73e8 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -16,9 +16,9 @@ * limitations under the License. 
* **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -42,12 +42,12 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -122,9 +122,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -170,9 +170,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + int64_t offset = 0; cublasStatus_t err; for 
(int64_t i = 0; i < group_count; i++) { diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp index 3df6c8360..1c7d1b7f9 100644 --- a/src/blas/backends/cublas/cublas_extensions.cpp +++ b/src/blas/backends/cublas/cublas_extensions.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 8ee228ab6..a9e2b7274 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -17,11 +17,11 @@ * **************************************************************************/ #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" -#include namespace oneapi { namespace mkl { @@ -42,8 +42,7 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault.
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // ASUM does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -83,10 +83,9 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; // SCAL does not support negative incx CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType1 *)&a, x_, std::abs(incx)); @@ -115,11 +114,11 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType *)&alpha, x_, incx, y_, incy); }); @@ -149,19 +148,19 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); - auto s_ = sc.get_mem(ih, s_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + auto s_ = sc.get_mem(s_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, a_, b_, c_, s_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -195,18 +194,18 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -235,11 +234,11 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -268,18 +267,18 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -314,11 +313,16 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST + // when the data is on buffer, it must be set to + // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation + // fault. When it is set to device it is users responsibility to + // synchronise as the function is completely asynchronous. + // cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, (cuDataType2 *)&c, (cuDataType3 *)&s); @@ -347,18 +351,18 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. 
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -389,20 +393,20 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto d1_ = sc.get_mem(ih, d1_acc); - auto d2_ = sc.get_mem(ih, d2_acc); - auto x1_ = sc.get_mem(ih, x1_acc); - auto y1_ = sc.get_mem(ih, y1_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto d1_ = sc.get_mem(d1_acc); + auto d2_ = sc.get_mem(d2_acc); + auto x1_ = sc.get_mem(x1_acc); + auto y1_ = sc.get_mem(y1_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, d1_, d2_, x1_, y1_, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -440,17 +444,17 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference netlib BLAS. 
@@ -487,11 +491,11 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -526,17 +530,17 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented as a reference IAMIN. 
@@ -573,17 +577,17 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // NRM2 does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -622,9 +626,9 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; @@ -658,9 +662,9 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); 
cublasStatus_t err; // SCAL does not support negative incx @@ -694,9 +698,9 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -729,9 +733,9 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -766,9 +770,9 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto param_ = reinterpret_cast(param); @@ -801,9 +805,9 @@ inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -837,9 
+841,9 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -877,9 +881,9 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -913,9 +917,9 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -942,9 +946,9 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto d1_ = reinterpret_cast(d1); auto d2_ = reinterpret_cast(d2); auto x1_ = reinterpret_cast(x1); @@ -986,9 +990,9 @@ inline cl::sycl::event iamax(Func func, 
cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1025,9 +1029,9 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -1068,9 +1072,9 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1108,9 +1112,9 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 
db23a3eb6..94fa79093 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -40,12 +40,12 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -78,12 +78,12 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, kl, ku, 
(cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -116,12 +116,12 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, m, n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -154,12 +154,12 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -189,12 +189,12 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -225,11 +225,11 @@ inline void her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_, lda); @@ -259,12 +259,12 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -294,12 +294,12 @@ inline void hpmv(Func func, 
cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -328,11 +328,11 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_); @@ -361,12 +361,12 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, 
y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -396,12 +396,12 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -432,12 +432,12 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -466,11 +466,11 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto 
a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_, lda); @@ -502,12 +502,12 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -540,12 +540,12 @@ inline void spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t 
err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -573,11 +573,11 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_); @@ -606,12 +606,12 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -640,11 +640,11 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -676,11 +676,11 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -712,11 +712,11 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -748,11 +748,11 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = 
a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -784,11 +784,11 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -820,11 +820,11 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), 
get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -861,9 +861,9 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -903,9 +903,9 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -944,9 +944,9 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -986,9 +986,9 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = 
reinterpret_cast(y); @@ -1026,9 +1026,9 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1068,9 +1068,9 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1105,9 +1105,9 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1144,9 +1144,9 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1184,9 +1184,9 @@ inline 
cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1220,9 +1220,9 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1259,9 +1259,9 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1300,9 +1300,9 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1340,9 +1340,9 @@ inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo 
upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1378,9 +1378,9 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1420,9 +1420,9 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1459,9 +1459,9 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1495,9 +1495,9 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1534,9 +1534,9 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1576,9 +1576,9 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1617,9 +1617,9 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1657,9 +1657,9 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1697,9 +1697,9 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1738,9 +1738,9 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 671a15ea7..ad06517e4 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -16,9 +16,9 @@ * limitations under the License. 
* **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -41,12 +41,11 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -85,12 +84,12 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha, a_, @@ -126,12 +125,12 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template 
get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -166,12 +165,12 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -201,11 +200,11 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = 
sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, @@ -238,11 +237,11 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuScalarType *)&alpha, a_, lda, @@ -274,12 +273,12 @@ inline void syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -315,12 +314,12 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -356,11 +355,11 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -393,11 +392,11 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), 
get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -435,9 +434,9 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -485,9 +484,9 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -528,9 +527,9 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -568,9 +567,9 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); cublasStatus_t err; @@ 
-611,9 +610,9 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); cublasStatus_t err; @@ -651,9 +650,9 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -695,9 +694,9 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -741,9 +740,9 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); cublasStatus_t err; @@ -783,9 +782,9 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue 
&queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp index e6ce1b4dc..beab19501 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.cpp +++ b/src/blas/backends/cublas/cublas_scope_handle.cpp @@ -48,7 +48,7 @@ cublas_handle::~cublas_handle() noexcept(false) { */ thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{}; -CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue) { +CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih): ih(ih){ placedContext_ = queue.get_context(); auto device = queue.get_device(); auto desired = cl::sycl::get_native(placedContext_); diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp index e73c63225..6aca620d0 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.hpp +++ b/src/blas/backends/cublas/cublas_scope_handle.hpp @@ -68,12 +68,13 @@ class CublasScopedContextHandler { CUcontext original_; cl::sycl::context placedContext_; bool needToRecover_; + cl::sycl::interop_handler& ih; static thread_local cublas_handle handle_helper; CUstream get_stream(const cl::sycl::queue &queue); cl::sycl::context get_context(const cl::sycl::queue &queue); public: - CublasScopedContextHandler(cl::sycl::queue queue); + CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih); ~CublasScopedContextHandler() noexcept(false); /** @@ -87,7 +88,7 @@ class CublasScopedContextHandler { // This is a work-around 
function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. template - inline T get_mem(cl::sycl::interop_handler ih, U acc) { + inline T get_mem(U acc) { CUdeviceptr cudaPtr = ih.get_mem(acc); return reinterpret_cast(cudaPtr); } diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp new file mode 100644 index 000000000..fd150cc37 --- /dev/null +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -0,0 +1,33 @@ +#ifndef _MKL_BLAS_CUBLAS_TASK_HPP_ +#define _MKL_BLAS_CUBLAS_TASK_HPP_ +#include +#include +#include +#include +#include "oneapi/mkl/types.hpp" +#include "cublas_scope_handle.hpp" +#include + +namespace oneapi { +namespace mkl { +namespace blas { +namespace cublas { + +template +static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.interop_task(f)) { + cgh.interop_task([f, queue](cl::sycl::interop_handler ih){ + auto sc = CublasScopedContextHandler(queue, ih); + f(sc); + }); +} + +template +static inline void onemkl_cublas_host_task(H &cgh, cl::sycl::queue queue, F f) { + (void)host_task_internal(cgh, queue, f); +} + +} // namespace cublas +} // namespace blas +} // namespace mkl +} // namespace oneapi +#endif // _MKL_BLAS_CUBLAS_TASK_HPP_ \ No newline at end of file From 761b5e64840031d7c934b11e27b7eebda2dc6966 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:04:18 +0200 Subject: [PATCH 2/4] [cublas] pass sc by reference --- src/blas/backends/cublas/cublas_batch.cpp | 6 +- src/blas/backends/cublas/cublas_level1.cpp | 56 ++++++------- src/blas/backends/cublas/cublas_level2.cpp | 92 +++++++++++----------- src/blas/backends/cublas/cublas_level3.cpp | 38 ++++----- 4 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6bc6d73e8..26ee4b2e5 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ 
b/src/blas/backends/cublas/cublas_batch.cpp @@ -42,7 +42,7 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -122,7 +122,7 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -170,7 +170,7 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index a9e2b7274..9a5909a0a 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -43,7 +43,7 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on 
buffer, it must be set to @@ -83,7 +83,7 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); cublasStatus_t err; @@ -114,7 +114,7 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -148,7 +148,7 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -194,7 +194,7 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -234,7 +234,7 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { 
auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -267,7 +267,7 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -313,7 +313,7 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -351,7 +351,7 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -393,7 +393,7 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -444,7 +444,7 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -491,7 +491,7 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -530,7 +530,7 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -577,7 +577,7 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -626,7 
+626,7 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -662,7 +662,7 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -698,7 +698,7 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -733,7 +733,7 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -770,7 +770,7 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -805,7 +805,7 @@ inline 
cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -841,7 +841,7 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -881,7 +881,7 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -917,7 +917,7 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -946,7 +946,7 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto d1_ = reinterpret_cast(d1); @@ -990,7 +990,7 @@ inline cl::sycl::event iamax(Func 
func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1029,7 +1029,7 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1072,7 +1072,7 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1112,7 +1112,7 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 94fa79093..8e7122690 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -40,7 +40,7 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - 
onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -78,7 +78,7 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -116,7 +116,7 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -154,7 +154,7 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -189,7 +189,7 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -225,7 +225,7 @@ inline void 
her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -259,7 +259,7 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -294,7 +294,7 @@ inline void hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -328,7 +328,7 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -361,7 +361,7 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -396,7 +396,7 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -432,7 +432,7 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -466,7 +466,7 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -502,7 +502,7 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -540,7 +540,7 @@ inline void spmv(Func func, cl::sycl::queue 
&queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -573,7 +573,7 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -606,7 +606,7 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -640,7 +640,7 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -676,7 +676,7 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + 
onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -712,7 +712,7 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -748,7 +748,7 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -784,7 +784,7 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -820,7 +820,7 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -861,7 +861,7 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for 
(int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -903,7 +903,7 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -944,7 +944,7 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -986,7 +986,7 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1026,7 +1026,7 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1068,7 +1068,7 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < 
num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1105,7 +1105,7 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1144,7 +1144,7 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1184,7 +1184,7 @@ inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1220,7 +1220,7 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1259,7 +1259,7 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; 
i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1300,7 +1300,7 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1340,7 +1340,7 @@ inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1378,7 +1378,7 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1420,7 +1420,7 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1459,7 +1459,7 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1495,7 +1495,7 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1534,7 +1534,7 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1576,7 +1576,7 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1617,7 +1617,7 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1657,7 +1657,7 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1697,7 +1697,7 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1738,7 +1738,7 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index ad06517e4..5cdbd495d 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -41,7 +41,7 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); auto b_ = sc.get_mem(b_acc); @@ -84,7 +84,7 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -125,7 +125,7 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -165,7 +165,7 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -200,7 +200,7 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -237,7 +237,7 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -273,7 +273,7 @@ inline void syr2k(Func func, 
cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -314,7 +314,7 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -355,7 +355,7 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -392,7 +392,7 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -434,7 +434,7 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -484,7 +484,7 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -527,7 +527,7 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -567,7 +567,7 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -610,7 +610,7 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -650,7 +650,7 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -694,7 +694,7 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -740,7 +740,7 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -782,7 +782,7 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); From 73a7f14389f6464667925426a24f5a934bf291b0 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:43:14 +0200 Subject: [PATCH 3/4] [cublas] remove double include from cublas_level1.cpp --- src/blas/backends/cublas/cublas_level1.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 9a5909a0a..61e75d223 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -19,7 +19,6 @@ #include "cublas_helper.hpp" #include "cublas_task.hpp" -#include "cublas_task.hpp" #include "oneapi/mkl/exceptions.hpp" #include 
"oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" From 34b7dae765d3763f1f4085b5f18285289112f748 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:47:51 +0200 Subject: [PATCH 4/4] [cublas] remove trailing return type from host_task_internal It was necessary for SFINAE, but we use ifdefs instead --- src/blas/backends/cublas/cublas_task.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp index fd150cc37..04fc239c4 100644 --- a/src/blas/backends/cublas/cublas_task.hpp +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -14,7 +14,7 @@ namespace blas { namespace cublas { template -static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.interop_task(f)) { +static inline void host_task_internal(H &cgh, cl::sycl::queue queue, F f) { cgh.interop_task([f, queue](cl::sycl::interop_handler ih){ auto sc = CublasScopedContextHandler(queue, ih); f(sc);