modelscope
diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu b/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu
Lines changed: 21 additions & 18 deletions
diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh b/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh
Lines changed: 8 additions & 0 deletions
@@ -1320,33 +1320,32 @@ struct ComputeTile_A16W8_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
 // dequant B
 #pragma unroll
     for (int i = 0; i < WARP_NITER / 2; ++i) {
-      cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i],
-                                BF_frag[reg_buf_idx][2 * i]);
+      typename HalfType<FType>::T2 B_zero_x2 =
+          num2num2(static_cast<typename HalfType<FType>::T1>(0.f));
+      typename HalfType<FType>::T2 B_zero_y2 =
+          num2num2(static_cast<typename HalfType<FType>::T1>(0.f));
       if (has_zp) {
-        BF_frag[reg_buf_idx][2 * i][0] =
-            __hsub2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_zero[i].x));
-        BF_frag[reg_buf_idx][2 * i][1] =
-            __hsub2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_zero[i].x));
+        B_zero_x2 = num2num2(B_zero[i].x);
+        B_zero_y2 = num2num2(B_zero[i].y);
       }
 
-      BF_frag[reg_buf_idx][2 * i][0] =
-          __hmul2(BF_frag[reg_buf_idx][2 * i][0], num2num2(B_scale[i].x));
-      BF_frag[reg_buf_idx][2 * i][1] =
-          __hmul2(BF_frag[reg_buf_idx][2 * i][1], num2num2(B_scale[i].x));
+      cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i],
+                                BF_frag[reg_buf_idx][2 * i]);
+
+      BF_frag[reg_buf_idx][2 * i][0] = dequantize_func(
+          BF_frag[reg_buf_idx][2 * i][0], num2num2(B_scale[i].x), B_zero_x2);
+      BF_frag[reg_buf_idx][2 * i][1] = dequantize_func(
+          BF_frag[reg_buf_idx][2 * i][1], num2num2(B_scale[i].x), B_zero_x2);
 
       cvt_8bx4_to_16bx4_bias128(BQ_frag[reg_buf_idx][2 * i + 1],
                                 BF_frag[reg_buf_idx][2 * i + 1]);
-      if (has_zp) {
-        BF_frag[reg_buf_idx][2 * i + 1][0] =
-            __hsub2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_zero[i].y));
-        BF_frag[reg_buf_idx][2 * i + 1][1] =
-            __hsub2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_zero[i].y));
-      }
 
       BF_frag[reg_buf_idx][2 * i + 1][0] =
-          __hmul2(BF_frag[reg_buf_idx][2 * i + 1][0], num2num2(B_scale[i].y));
+          dequantize_func(BF_frag[reg_buf_idx][2 * i + 1][0],
+                          num2num2(B_scale[i].y), B_zero_y2);
       BF_frag[reg_buf_idx][2 * i + 1][1] =
-          __hmul2(BF_frag[reg_buf_idx][2 * i + 1][1], num2num2(B_scale[i].y));
+          dequantize_func(BF_frag[reg_buf_idx][2 * i + 1][1],
+                          num2num2(B_scale[i].y), B_zero_y2);
     }
   }
 
@@ -1677,6 +1676,10 @@ void ampere_hgemm_A16W8_perc_f16_f16_MtilexNtilex32_mma16816_multistage_AN_BTN32
     const uint32_t K, void* workspace, const int sm_version,
     const SplitKParams fused_gemm_params, const float alpha,
     cudaStream_t stream) {
+  if (sm_version < 0x0800) {
+    throw std::runtime_error(
+        "this kernel is not supported on devices below sm80");
+  }
   int Mtile = fused_gemm_params.Mtile;
   int grid_x = (M + Mtile - 1) / Mtile;
   int Ntile = fused_gemm_params.Ntile;
 
@@ -1030,7 +1030,15 @@ __device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128<__nv_bfloat162>(
 }
 
 static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+  // Returns an nv_bfloat162 built from the scalar x via __bfloat162bfloat162.
+  // The intrinsic is only used when compiling device code for sm80 or newer;
+  // every other compilation pass gets a placeholder body instead.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
   return __bfloat162bfloat162(x);
+#else
+  // Compare (major, minor) as a pair: testing the minor version on its own
+  // would classify CUDA 12.0-12.2 as older than 11.3 and send it down the
+  // legacy fallback below.
+#if (__CUDACC_VER_MAJOR__ > 11) || \
+    (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3)
+  __builtin_unreachable();
+#else
+  // Older toolkits: return a value-initialized pair so the function still
+  // has a return statement. NOTE(review): presumably never executed on
+  // pre-sm80 devices -- confirm that all callers are gated on sm80+.
+  return nv_bfloat162{};
+#endif  // CUDA toolkit >= 11.3
+#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 }
 
 static __device__ half2 inline num2num2(const half x) {
Original file line number	Diff line number	Diff line change
`@@ -1030,7 +1030,15 @@ __device__ __forceinline__ void cvt_8bx4_to_16bx4_bias128<__nv_bfloat162>(`
`1030`	`1030`	`}`
`1031`	`1031`
`1032`	`1032`	`static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {`
	`1033`	`+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800`
`1033`	`1034`	`return __bfloat162bfloat162(x);`
	`1035`	`+#else`
	`1036`	`+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3`
	`1037`	`+ __builtin_unreachable();`
	`1038`	`+#else`
	`1039`	`+ return nv_bfloat162{};`
	`1040`	`+#endif // __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3`
	`1041`	`+#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800`
`1034`	`1042`	`}`
`1035`	`1043`
`1036`	`1044`	`static __device__ half2 inline num2num2(const half x) {`