@@ -319,15 +319,6 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
-static constexpr __host__ int ggml_cuda_get_physical_warp_size_host() {
-#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
-    return 64;
-#else
-    return 32;
-#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
-}
-
-
 // Maximum number of bytes that can be copied in a single instruction.
 static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
 #ifdef GGML_USE_HIP
@@ -470,7 +461,13 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static __device__ __forceinline__ unsigned int get_warp_mask() {
+#ifdef __HIP_PLATFORM_AMD__
+typedef uint64_t ggml_lane_mask_t;
+#else
+typedef uint32_t ggml_lane_mask_t;
+#endif // __HIP_PLATFORM_AMD__
+
+static __device__ __forceinline__ ggml_lane_mask_t get_warp_mask() {
 #ifdef __HIP_PLATFORM_AMD__
     return __ballot(1); // HIP equivalent
 #else
@@ -481,10 +478,9 @@ static __device__ __forceinline__ unsigned int get_warp_mask() {
 template<typename T, int width = WARP_SIZE>
 static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const T t = __shfl_up_sync(mask, x, offset, width);
+        const T t = __shfl_up_sync(0xffffffff, x, offset, width);
         if (lane_id >= offset) {
             x += t;
         }
@@ -495,11 +491,10 @@ static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_prefix_inclusive_sum(float2 a) {
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const float t_x = __shfl_up_sync(mask, a.x, offset, width);
-        const float t_y = __shfl_up_sync(mask, a.y, offset, width);
+        const float t_x = __shfl_up_sync(0xffffffff, a.x, offset, width);
+        const float t_y = __shfl_up_sync(0xffffffff, a.y, offset, width);
         if (lane_id >= offset) {
             a.x += t_x;
             a.y += t_y;
@@ -512,10 +507,9 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
 #ifdef FP16_AVAILABLE
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const half2 t = __shfl_up_sync(mask, a, offset, width);
+        const half2 t = __shfl_up_sync(0xffffffff, a, offset, width);
         if (lane_id >= offset) {
             a = __hadd2(a, t);
         }
@@ -951,6 +945,11 @@ const ggml_cuda_device_info & ggml_cuda_info();
 void ggml_cuda_set_device(int device);
 int ggml_cuda_get_device();
 
+static __host__ int ggml_cuda_get_physical_warp_size_host() {
+    const auto &info = ggml_cuda_info().devices[ggml_cuda_get_device()];
+    return info.warp_size;
+}
+
 struct ggml_cuda_pool {
     virtual ~ggml_cuda_pool() = default;
 
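For reference, below is a minimal standalone CUDA sketch (not part of this commit) of the shuffle-based inclusive prefix sum that the patch switches to a constant full mask: with every lane of the warp active, passing 0xffffffff to __shfl_up_sync is valid, and the width argument still confines the scan to independent sub-warp segments. The demo kernel name, the helper name warp_prefix_inclusive_sum_demo, and the single 32-lane launch are assumptions chosen for the example, not names from the ggml sources.

// Sketch only: illustrative, assumed names, not from the ggml sources.
#include <cstdio>
#include <cuda_runtime.h>

template <int width = 32>
static __device__ __forceinline__ int warp_prefix_inclusive_sum_demo(int x) {
    const int lane_id = threadIdx.x % width;
#pragma unroll
    for (int offset = 1; offset < width; offset <<= 1) {
        // All lanes execute the shuffle, so a constant full mask is safe here;
        // `width` restricts the scan to independent sub-warp segments.
        const int t = __shfl_up_sync(0xffffffff, x, offset, width);
        if (lane_id >= offset) {
            x += t;
        }
    }
    return x;
}

__global__ void prefix_sum_demo(int * out) {
    out[threadIdx.x] = warp_prefix_inclusive_sum_demo(1); // lane i ends up with i + 1
}

int main() {
    int * d_out = nullptr;
    cudaMalloc(&d_out, 32 * sizeof(int));
    prefix_sum_demo<<<1, 32>>>(d_out);

    int h_out[32];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; ++i) {
        printf("%d ", h_out[i]); // expected: 1 2 3 ... 32
    }
    printf("\n");
    cudaFree(d_out);
    return 0;
}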