From 12edf97387b6712b0e0f7937ae2abae7198f5aa1 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Thu, 20 May 2021 15:20:30 +0200 Subject: [PATCH 1/4] [cublas] introduce onemkl_cublas_host_task [cublas] move dpc++ internal headers into cublas_task.hpp --- src/blas/backends/cublas/cublas_batch.cpp | 22 +- .../backends/cublas/cublas_extensions.cpp | 4 +- src/blas/backends/cublas/cublas_level1.cpp | 190 +++++------ src/blas/backends/cublas/cublas_level2.cpp | 306 +++++++++--------- src/blas/backends/cublas/cublas_level3.cpp | 131 ++++---- .../backends/cublas/cublas_scope_handle.cpp | 2 +- .../backends/cublas/cublas_scope_handle.hpp | 5 +- src/blas/backends/cublas/cublas_task.hpp | 33 ++ 8 files changed, 365 insertions(+), 328 deletions(-) create mode 100644 src/blas/backends/cublas/cublas_task.hpp diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6fd32a62e..6bc6d73e8 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -16,9 +16,9 @@ * limitations under the License. 
* **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -42,12 +42,12 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -122,9 +122,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -170,9 +170,9 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + int64_t offset = 0; cublasStatus_t err; for 
(int64_t i = 0; i < group_count; i++) { diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp index 3df6c8360..1c7d1b7f9 100644 --- a/src/blas/backends/cublas/cublas_extensions.cpp +++ b/src/blas/backends/cublas/cublas_extensions.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 8ee228ab6..a9e2b7274 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -17,11 +17,11 @@ * **************************************************************************/ #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" -#include namespace oneapi { namespace mkl { @@ -42,8 +42,7 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault.
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // ASUM does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -83,10 +83,9 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; // SCAL does not support negative incx CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType1 *)&a, x_, std::abs(incx)); @@ -115,11 +114,11 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, (cuDataType *)&alpha, x_, incx, y_, incy); }); @@ -149,19 +148,19 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); - auto s_ = sc.get_mem(ih, s_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + auto s_ = sc.get_mem(s_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, a_, b_, c_, s_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -195,18 +194,18 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -235,11 +234,11 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -268,18 +267,18 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -314,11 +313,16 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST + // when the data is on buffer, it must be set to + // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation + // fault. When it is set to device it is users responsibility to + // synchronise as the function is completely asynchronous. + // cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy, (cuDataType2 *)&c, (cuDataType3 *)&s); @@ -347,18 +351,18 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. 
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -389,20 +393,20 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto d1_ = sc.get_mem(ih, d1_acc); - auto d2_ = sc.get_mem(ih, d2_acc); - auto x1_ = sc.get_mem(ih, x1_acc); - auto y1_ = sc.get_mem(ih, y1_acc); - auto param_ = sc.get_mem(ih, param_acc); + auto d1_ = sc.get_mem(d1_acc); + auto d2_ = sc.get_mem(d2_acc); + auto x1_ = sc.get_mem(x1_acc); + auto y1_ = sc.get_mem(y1_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, d1_, d2_, x1_, y1_, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -440,17 +444,17 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference netlib BLAS. 
@@ -487,11 +491,11 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -526,17 +530,17 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto int_res_ = sc.get_mem(ih, int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented as a reference IAMIN. 
@@ -573,17 +577,17 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(ih, x_acc); - auto res_ = sc.get_mem(ih, res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // NRM2 does not support negative index CUBLAS_ERROR_FUNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -622,9 +626,9 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; @@ -658,9 +662,9 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); 
cublasStatus_t err; // SCAL does not support negative incx @@ -694,9 +698,9 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -729,9 +733,9 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -766,9 +770,9 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto param_ = reinterpret_cast(param); @@ -801,9 +805,9 @@ inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -837,9 
+841,9 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -877,9 +881,9 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -913,9 +917,9 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); auto res_ = reinterpret_cast(result); @@ -942,9 +946,9 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto d1_ = reinterpret_cast(d1); auto d2_ = reinterpret_cast(d2); auto x1_ = reinterpret_cast(x1); @@ -986,9 +990,9 @@ inline cl::sycl::event iamax(Func func, 
cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1025,9 +1029,9 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); cublasStatus_t err; @@ -1068,9 +1072,9 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto int_res_p_ = reinterpret_cast(int_res_p); cublasStatus_t err; @@ -1108,9 +1112,9 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto x_ = reinterpret_cast(x); auto res_ = reinterpret_cast(result); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 
db23a3eb6..94fa79093 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -16,9 +16,9 @@ * limitations under the License. * **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -40,12 +40,12 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -78,12 +78,12 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(trans), m, n, kl, ku, 
(cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -116,12 +116,12 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, m, n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -154,12 +154,12 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -189,12 +189,12 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -225,11 +225,11 @@ inline void her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_, lda); @@ -259,12 +259,12 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -294,12 +294,12 @@ inline void hpmv(Func func, 
cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -328,11 +328,11 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, x_, incx, a_); @@ -361,12 +361,12 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, 
y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -396,12 +396,12 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -432,12 +432,12 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, lda, x_, incx, (cuDataType *)&beta, y_, @@ -466,11 +466,11 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto 
a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_, lda); @@ -502,12 +502,12 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_, lda); @@ -540,12 +540,12 @@ inline void spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t 
err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, x_, incx, (cuDataType *)&beta, y_, incy); @@ -573,11 +573,11 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, a_); @@ -606,12 +606,12 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); - auto y_ = sc.get_mem(ih, y_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, incx, y_, incy, a_); @@ -640,11 +640,11 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -676,11 +676,11 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, k, @@ -712,11 +712,11 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -748,11 +748,11 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = 
a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -784,11 +784,11 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -820,11 +820,11 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto x_ = sc.get_mem(ih, x_acc); + + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), 
get_cublas_operation(trans), get_cublas_diag_type(unit_diag), n, a_, @@ -861,9 +861,9 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -903,9 +903,9 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -944,9 +944,9 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -986,9 +986,9 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = 
reinterpret_cast(y); @@ -1026,9 +1026,9 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1068,9 +1068,9 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1105,9 +1105,9 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1144,9 +1144,9 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1184,9 +1184,9 @@ inline 
cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1220,9 +1220,9 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1259,9 +1259,9 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1300,9 +1300,9 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1340,9 +1340,9 @@ inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo 
upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1378,9 +1378,9 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1420,9 +1420,9 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1459,9 +1459,9 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1495,9 +1495,9 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); auto y_ = reinterpret_cast(y); @@ -1534,9 +1534,9 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1576,9 +1576,9 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1617,9 +1617,9 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1657,9 +1657,9 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1697,9 +1697,9 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; @@ -1738,9 +1738,9 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto x_ = reinterpret_cast(x); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 671a15ea7..ad06517e4 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -16,9 +16,9 @@ * limitations under the License. 
* **************************************************************************/ -#include #include "cublas_helper.hpp" -#include "cublas_scope_handle.hpp" +#include "cublas_task.hpp" + #include "oneapi/mkl/exceptions.hpp" #include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" @@ -41,12 +41,11 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, a_, lda, @@ -85,12 +84,12 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha, a_, @@ -126,12 +125,12 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template 
get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -166,12 +165,12 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, a_, @@ -201,11 +200,11 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = 
sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, @@ -238,11 +237,11 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuScalarType *)&alpha, a_, lda, @@ -274,12 +273,12 @@ inline void syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -315,12 +314,12 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - 
cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); - auto c_ = sc.get_mem(ih, c_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), n, k, (cuDataType *)&alpha, a_, lda, b_, @@ -356,11 +355,11 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -393,11 +392,11 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(ih, a_acc); - auto b_ = sc.get_mem(ih, b_acc); + + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC(func, err, handle, get_cublas_side_mode(left_right), 
get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -435,9 +434,9 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -485,9 +484,9 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -528,9 +527,9 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -568,9 +567,9 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); cublasStatus_t err; @@ 
-611,9 +610,9 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto c_ = reinterpret_cast(c); cublasStatus_t err; @@ -651,9 +650,9 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -695,9 +694,9 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); auto c_ = reinterpret_cast(c); @@ -741,9 +740,9 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); cublasStatus_t err; @@ -783,9 +782,9 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue 
&queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - cgh.interop_task([=](cl::sycl::interop_handler ih) { - auto sc = CublasScopedContextHandler(queue); + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { auto handle = sc.get_handle(queue); + auto a_ = reinterpret_cast(a); auto b_ = reinterpret_cast(b); cublasStatus_t err; diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp index e6ce1b4dc..beab19501 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.cpp +++ b/src/blas/backends/cublas/cublas_scope_handle.cpp @@ -48,7 +48,7 @@ cublas_handle::~cublas_handle() noexcept(false) { */ thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{}; -CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue) { +CublasScopedContextHandler::CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih): ih(ih){ placedContext_ = queue.get_context(); auto device = queue.get_device(); auto desired = cl::sycl::get_native(placedContext_); diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp index e73c63225..6aca620d0 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.hpp +++ b/src/blas/backends/cublas/cublas_scope_handle.hpp @@ -68,12 +68,13 @@ class CublasScopedContextHandler { CUcontext original_; cl::sycl::context placedContext_; bool needToRecover_; + cl::sycl::interop_handler& ih; static thread_local cublas_handle handle_helper; CUstream get_stream(const cl::sycl::queue &queue); cl::sycl::context get_context(const cl::sycl::queue &queue); public: - CublasScopedContextHandler(cl::sycl::queue queue); + CublasScopedContextHandler(cl::sycl::queue queue, cl::sycl::interop_handler& ih); ~CublasScopedContextHandler() noexcept(false); /** @@ -87,7 +88,7 @@ class CublasScopedContextHandler { // This is a work-around 
function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. template - inline T get_mem(cl::sycl::interop_handler ih, U acc) { + inline T get_mem(U acc) { CUdeviceptr cudaPtr = ih.get_mem(acc); return reinterpret_cast(cudaPtr); } diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp new file mode 100644 index 000000000..fd150cc37 --- /dev/null +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -0,0 +1,33 @@ +#ifndef _MKL_BLAS_CUBLAS_TASK_HPP_ +#define _MKL_BLAS_CUBLAS_TASK_HPP_ +#include +#include +#include +#include +#include "oneapi/mkl/types.hpp" +#include "cublas_scope_handle.hpp" +#include + +namespace oneapi { +namespace mkl { +namespace blas { +namespace cublas { + +template +static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.interop_task(f)) { + cgh.interop_task([f, queue](cl::sycl::interop_handler ih){ + auto sc = CublasScopedContextHandler(queue, ih); + f(sc); + }); +} + +template +static inline void onemkl_cublas_host_task(H &cgh, cl::sycl::queue queue, F f) { + (void)host_task_internal(cgh, queue, f); +} + +} // namespace cublas +} // namespace blas +} // namespace mkl +} // namespace oneapi +#endif // _MKL_BLAS_CUBLAS_TASK_HPP_ \ No newline at end of file From 761b5e64840031d7c934b11e27b7eebda2dc6966 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:04:18 +0200 Subject: [PATCH 2/4] [cublas] pass sc by reference --- src/blas/backends/cublas/cublas_batch.cpp | 6 +- src/blas/backends/cublas/cublas_level1.cpp | 56 ++++++------- src/blas/backends/cublas/cublas_level2.cpp | 92 +++++++++++----------- src/blas/backends/cublas/cublas_level3.cpp | 38 ++++----- 4 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 6bc6d73e8..26ee4b2e5 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ 
b/src/blas/backends/cublas/cublas_batch.cpp @@ -42,7 +42,7 @@ inline void gemm_batch(Func func, cl::sycl::queue &queue, transpose transa, tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -122,7 +122,7 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -170,7 +170,7 @@ inline cl::sycl::event gemm_batch(Func func, cl::sycl::queue &queue, transpose * for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index a9e2b7274..9a5909a0a 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -43,7 +43,7 @@ inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on 
buffer, it must be set to @@ -83,7 +83,7 @@ inline void scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, cl::sycl::b overflow_check(n, incx); queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); cublasStatus_t err; @@ -114,7 +114,7 @@ inline void axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, cl::sycl queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -148,7 +148,7 @@ inline void rotg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &a, auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -194,7 +194,7 @@ inline void rotm(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -234,7 +234,7 @@ inline void copy(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { 
auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -267,7 +267,7 @@ inline void dot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -313,7 +313,7 @@ inline void rot(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -351,7 +351,7 @@ void sdsdot(cl::sycl::queue &queue, int64_t n, float sb, cl::sycl::buffer(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -393,7 +393,7 @@ inline void rotmg(Func func, cl::sycl::queue &queue, cl::sycl::buffer &d1, auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -444,7 +444,7 @@ inline void iamax(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -491,7 +491,7 @@ inline void swap(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = sc.get_mem(x_acc); @@ -530,7 +530,7 @@ inline void iamin(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -577,7 +577,7 @@ inline void nrm2(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer< queue.submit([&](cl::sycl::handler &cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST @@ -626,7 
+626,7 @@ inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -662,7 +662,7 @@ inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -698,7 +698,7 @@ inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alph for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -733,7 +733,7 @@ inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -770,7 +770,7 @@ inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -805,7 +805,7 @@ inline 
cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -841,7 +841,7 @@ inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -881,7 +881,7 @@ inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -917,7 +917,7 @@ cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -946,7 +946,7 @@ inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto d1_ = reinterpret_cast(d1); @@ -990,7 +990,7 @@ inline cl::sycl::event iamax(Func 
func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1029,7 +1029,7 @@ inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1072,7 +1072,7 @@ inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); @@ -1112,7 +1112,7 @@ inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto x_ = reinterpret_cast(x); diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 94fa79093..8e7122690 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -40,7 +40,7 @@ inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - 
onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -78,7 +78,7 @@ inline void gbmv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -116,7 +116,7 @@ inline void ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -154,7 +154,7 @@ inline void hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -189,7 +189,7 @@ inline void hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -225,7 +225,7 @@ inline void 
her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -259,7 +259,7 @@ inline void her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -294,7 +294,7 @@ inline void hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -328,7 +328,7 @@ inline void hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -361,7 +361,7 @@ inline void hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -396,7 +396,7 @@ inline void sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -432,7 +432,7 @@ inline void symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -466,7 +466,7 @@ inline void syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -502,7 +502,7 @@ inline void syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -540,7 +540,7 @@ inline void spmv(Func func, cl::sycl::queue 
&queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -573,7 +573,7 @@ inline void spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -606,7 +606,7 @@ inline void spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -640,7 +640,7 @@ inline void tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -676,7 +676,7 @@ inline void tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + 
onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -712,7 +712,7 @@ inline void tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -748,7 +748,7 @@ inline void tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -784,7 +784,7 @@ inline void trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -820,7 +820,7 @@ inline void trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -861,7 +861,7 @@ inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, for 
(int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -903,7 +903,7 @@ inline cl::sycl::event gbmv(Func func, cl::sycl::queue &queue, transpose trans, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -944,7 +944,7 @@ inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -986,7 +986,7 @@ inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1026,7 +1026,7 @@ inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1068,7 +1068,7 @@ inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < 
num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1105,7 +1105,7 @@ inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1144,7 +1144,7 @@ inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1184,7 +1184,7 @@ inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1220,7 +1220,7 @@ inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1259,7 +1259,7 @@ inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; 
i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1300,7 +1300,7 @@ inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1340,7 +1340,7 @@ inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1378,7 +1378,7 @@ inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1420,7 +1420,7 @@ inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1459,7 +1459,7 @@ inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1495,7 +1495,7 @@ inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1534,7 +1534,7 @@ inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1576,7 +1576,7 @@ inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1617,7 +1617,7 @@ inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1657,7 +1657,7 @@ inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1697,7 +1697,7 @@ inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -1738,7 +1738,7 @@ inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index ad06517e4..5cdbd495d 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -41,7 +41,7 @@ inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); auto b_ = sc.get_mem(b_acc); @@ -84,7 +84,7 @@ inline void gemm(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -125,7 +125,7 @@ inline void symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -165,7 +165,7 @@ inline void hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -200,7 +200,7 @@ inline void syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -237,7 +237,7 @@ inline void herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -273,7 +273,7 @@ inline void syr2k(Func func, 
cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -314,7 +314,7 @@ inline void her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -355,7 +355,7 @@ inline void trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -392,7 +392,7 @@ inline void trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_ queue.submit([&](cl::sycl::handler &cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = sc.get_mem(a_acc); @@ -434,7 +434,7 @@ inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -484,7 +484,7 @@ inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -527,7 +527,7 @@ inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -567,7 +567,7 @@ inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -610,7 +610,7 @@ inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -650,7 +650,7 @@ inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, 
queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -694,7 +694,7 @@ inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -740,7 +740,7 @@ inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); @@ -782,7 +782,7 @@ inline cl::sycl::event trsm(Func func, cl::sycl::queue &queue, side left_right, for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler sc) { + onemkl_cublas_host_task(cgh, queue,[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); auto a_ = reinterpret_cast(a); From 73a7f14389f6464667925426a24f5a934bf291b0 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:43:14 +0200 Subject: [PATCH 3/4] [cublas] remove double include from cublas_level1.cpp --- src/blas/backends/cublas/cublas_level1.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 9a5909a0a..61e75d223 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -19,7 +19,6 @@ #include "cublas_helper.hpp" #include "cublas_task.hpp" -#include "cublas_task.hpp" #include "oneapi/mkl/exceptions.hpp" #include 
"oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp" From 34b7dae765d3763f1f4085b5f18285289112f748 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Mon, 31 May 2021 15:47:51 +0200 Subject: [PATCH 4/4] [cublas] remove trailing return type from host_task_internal It was necessary for SFINAE, but we use ifdefs instead --- src/blas/backends/cublas/cublas_task.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp index fd150cc37..04fc239c4 100644 --- a/src/blas/backends/cublas/cublas_task.hpp +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -14,7 +14,7 @@ namespace blas { namespace cublas { template -static inline auto host_task_internal(H &cgh, cl::sycl::queue queue, F f) -> decltype(cgh.interop_task(f)) { +static inline void host_task_internal(H &cgh, cl::sycl::queue queue, F f) { cgh.interop_task([f, queue](cl::sycl::interop_handler ih){ auto sc = CublasScopedContextHandler(queue, ih); f(sc);