@@ -136,11 +136,6 @@ __device__ __forceinline__ T from_float(const float& inp) {
 
 template <typename T>
 __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
-  [[maybe_unused]] union tmpcvt {
-    uint16_t u;
-    _Float16 f;
-    __hip_bfloat16 b;
-  } t16;
   _B16x4 ret;
   if constexpr (std::is_same<T, _Float16>::value) {
     union h2cvt {
@@ -169,11 +164,6 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
                                         const _B16x4& inp2) {
-  [[maybe_unused]] union tmpcvt {
-    uint16_t u;
-    _Float16 f;
-    __hip_bfloat16 b;
-  } t1, t2, res;
   _B16x4 ret;
   if constexpr (std::is_same<T, _Float16>::value) {
     union h2cvt {
@@ -325,8 +315,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
 
   constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);
 
-  [[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1];
-  [[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1];
   // shared_logits is used for multiple purposes
   __shared__ _B16x4 shared_logits[NWARPS][4][16][4];
 
@@ -444,8 +432,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
     const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
     const int klocal_token_idx =
         TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    [[maybe_unused]] const int kglobal_token_idx =
-        partition_start_token_idx + klocal_token_idx;
     const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
     const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
 
@@ -1309,9 +1295,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const auto warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const auto laneid = threadIdx.x % WARP_SIZE;
 
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2080,9 +2064,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
 
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
@@ -2816,9 +2798,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
 
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
 
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS
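
Note on the cleanup: every declaration deleted above was marked `[[maybe_unused]]` and never read, so removing it changes no behavior. For context, the sketch below is illustrative only; the helper names, the demo kernel, and the use of the `__floats2half2_rn` intrinsic are assumptions for the example, not code from this file. It shows the kind of union-based float-to-fp16 packing that the surviving `h2cvt` conversion branches rely on:

```cpp
#include <hip/hip_fp16.h>
#include <cstdint>

// Reinterpret two packed fp16 lanes as one raw 32-bit pattern.
union h2cvt_demo {
  __half2 h2;     // two packed fp16 values
  uint32_t bits;  // the same 32 bits viewed as a raw integer
};

// Convert two floats to fp16 (round-to-nearest) and return the packed bits.
__device__ inline uint32_t pack_two_floats_to_fp16(float lo, float hi) {
  h2cvt_demo u;
  u.h2 = __floats2half2_rn(lo, hi);  // convert and pack two floats
  return u.bits;                     // hand back the raw bit pattern
}

// Minimal usage: each thread packs one pair of input floats.
__global__ void pack_demo_kernel(const float* in, uint32_t* out) {
  const int i = threadIdx.x;
  out[i] = pack_two_floats_to_fp16(in[2 * i], in[2 * i + 1]);
}
```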