
Commit 86a0853

Fix strides; hardcode mask; add ggml_lane_mask_t
1 parent efd619a commit 86a0853

2 files changed: 19 additions & 20 deletions

ggml/src/ggml-cuda/common.cuh

Lines changed: 16 additions & 17 deletions
@@ -319,15 +319,6 @@ static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
-static constexpr __host__ int ggml_cuda_get_physical_warp_size_host() {
-#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
-    return 64;
-#else
-    return 32;
-#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
-}
-
-
 // Maximum number of bytes that can be copied in a single instruction.
 static constexpr __device__ int ggml_cuda_get_max_cpy_bytes() {
 #ifdef GGML_USE_HIP
@@ -470,7 +461,13 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static __device__ __forceinline__ unsigned int get_warp_mask() {
+#ifdef __HIP_PLATFORM_AMD__
+typedef uint64_t ggml_lane_mask_t;
+#else
+typedef uint32_t ggml_lane_mask_t;
+#endif // __HIP_PLATFORM_AMD__
+
+static __device__ __forceinline__ ggml_lane_mask_t get_warp_mask() {
 #ifdef __HIP_PLATFORM_AMD__
     return __ballot(1); // HIP equivalent
 #else
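
The typedef exists because the two backends report lane masks at different
widths: __activemask() on CUDA yields a 32-bit mask (one bit per lane of a
32-wide warp), while __ballot() under HIP returns a 64-bit mask, matching
AMD's up-to-64-lane wavefronts. A minimal sketch of how the alias keeps
callers width-agnostic; the count_active_lanes helper below is hypothetical
and not part of this commit:

// Hypothetical helper: count the active lanes in the current warp/wavefront.
// ggml_lane_mask_t hides the 32-bit vs. 64-bit difference between backends.
static __device__ __forceinline__ int count_active_lanes() {
    const ggml_lane_mask_t mask = get_warp_mask();
#ifdef __HIP_PLATFORM_AMD__
    return __popcll(mask); // 64-bit popcount for a 64-lane wavefront
#else
    return __popc(mask);   // 32-bit popcount for a 32-lane warp
#endif // __HIP_PLATFORM_AMD__
}
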
@@ -481,10 +478,9 @@ static __device__ __forceinline__ unsigned int get_warp_mask() {
 template<typename T, int width = WARP_SIZE>
 static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const T t = __shfl_up_sync(mask, x, offset, width);
+        const T t = __shfl_up_sync(0xffffffff, x, offset, width);
         if (lane_id >= offset) {
             x += t;
         }
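
The prefix-sum loops themselves are a standard Hillis-Steele inclusive scan:
each pass doubles the span already accumulated, so after log2(width) passes
lane i holds the sum of lanes 0..i. A host-side reference of the same
recurrence, given here only to illustrate what the warp loop computes (not
part of the commit):

// Inclusive prefix sum over width values, mirroring the warp-level loop:
// on each pass, slot i adds the value that sat offset slots to its left,
// reading the pre-pass value just as the shuffle does.
template <typename T>
static void prefix_inclusive_sum_ref(T * x, int width) {
    for (int offset = 1; offset < width; offset <<= 1) {
        for (int i = width - 1; i >= offset; --i) {
            x[i] += x[i - offset]; // matches: if (lane_id >= offset) x += t;
        }
    }
}
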
@@ -495,11 +491,10 @@ static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_prefix_inclusive_sum(float2 a) {
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const float t_x = __shfl_up_sync(mask, a.x, offset, width);
-        const float t_y = __shfl_up_sync(mask, a.y, offset, width);
+        const float t_x = __shfl_up_sync(0xffffffff, a.x, offset, width);
+        const float t_y = __shfl_up_sync(0xffffffff, a.y, offset, width);
         if (lane_id >= offset) {
             a.x += t_x;
             a.y += t_y;
@@ -512,10 +507,9 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
 #ifdef FP16_AVAILABLE
     const int lane_id = threadIdx.x % width;
-    const auto mask = get_warp_mask();
 #pragma unroll
     for (int offset = 1; offset < width; offset <<= 1) {
-        const half2 t = __shfl_up_sync(mask, a, offset, width);
+        const half2 t = __shfl_up_sync(0xffffffff, a, offset, width);
         if (lane_id >= offset) {
             a = __hadd2(a, t);
         }
@@ -951,6 +945,11 @@ const ggml_cuda_device_info & ggml_cuda_info();
 void ggml_cuda_set_device(int device);
 int ggml_cuda_get_device();
 
+static __host__ int ggml_cuda_get_physical_warp_size_host() {
+    const auto &info = ggml_cuda_info().devices[ggml_cuda_get_device()];
+    return info.warp_size;
+}
+
 struct ggml_cuda_pool {
     virtual ~ggml_cuda_pool() = default;
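
This is the replacement for the compile-time __host__ variant deleted at the
top of the file: a single HIP build can serve both wave32 and wave64 devices,
so host code has to ask the currently selected device for its warp width
instead of relying on architecture macros, which only mean something while
compiling device code. An illustrative consistency check (not part of the
commit) of what the new helper is expected to return:

// Illustrative only: the host-side query should agree with the warp size the
// device itself reports through its properties.
static void check_warp_size(int device) {
    ggml_cuda_set_device(device);
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    GGML_ASSERT(prop.warpSize == ggml_cuda_get_physical_warp_size_host());
}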

ggml/src/ggml-cuda/cumsum.cu

Lines changed: 3 additions & 3 deletions
@@ -13,8 +13,8 @@ static __global__ void cumsum_cub_kernel(
         const T * __restrict__ src,
         T * __restrict__ dst,
         const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
-        const int64_t nb01, const int64_t nb02, const int64_t nb03,
-        const int64_t nb1, const int64_t nb2, const int64_t nb3) {
+        const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t s1, const int64_t s2, const int64_t s3) {
 #ifdef GGML_CUDA_USE_CUB
     using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;
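
The nb* -> s* rename follows the naming used elsewhere in ggml-cuda, where nb*
usually denotes byte strides copied straight from the tensor and s* denotes
element strides that a typed T * pointer can be indexed with directly. The
host side would then be expected to divide the byte strides by the element
size before the launch, roughly as sketched below (an assumption for
illustration, not the commit's actual launch code):

// Sketch: derive element strides (s*) from ggml's byte strides (nb*), assuming
// each nb[i] is a multiple of the element size.
static void byte_to_element_strides(const ggml_tensor * t,
                                    int64_t & s1, int64_t & s2, int64_t & s3) {
    const size_t ts = ggml_type_size(t->type);
    s1 = (int64_t) (t->nb[1] / ts);
    s2 = (int64_t) (t->nb[2] / ts);
    s3 = (int64_t) (t->nb[3] / ts);
}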

@@ -168,7 +168,7 @@ static void cumsum_cuda(
     }
 #endif // GGML_CUDA_USE_CUB
     dim3 grid_dims(ne01, ne02, ne03);
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size_host();
+    const int warp_size = ggml_cuda_get_physical_warp_size_host();
     const int num_warps = (ne00 + warp_size - 1) / warp_size;
     int block_size = num_warps * warp_size;
     block_size = std::min(block_size, CUDA_CUMSUM_BLOCK_SIZE);
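
The constexpr -> const change is forced by the previous file: the warp size is
no longer a compile-time constant but a run-time property of the active
device. The launch geometry is unchanged: one block per row along dim 0 for
each (i1, i2, i3), with the block size rounded up to a whole number of warps
and clamped to CUDA_CUMSUM_BLOCK_SIZE. A standalone sketch of that arithmetic,
with 256 assumed as the cap purely for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Block-size calculation as in cumsum_cuda; 256 stands in for
// CUDA_CUMSUM_BLOCK_SIZE, whose real value is not shown in this diff.
static int cumsum_block_size(int64_t ne00, int warp_size) {
    const int num_warps = (ne00 + warp_size - 1) / warp_size; // round up to whole warps
    return std::min(num_warps * warp_size, 256);
}

int main() {
    std::printf("%d\n", cumsum_block_size( 100, 32)); // 4 warps -> 128 threads
    std::printf("%d\n", cumsum_block_size( 100, 64)); // 2 waves -> 128 threads
    std::printf("%d\n", cumsum_block_size(5000, 32)); // 5024, clamped to 256
    return 0;
}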
