diff --git a/kt-kernel/CMakeLists.txt b/kt-kernel/CMakeLists.txt index 24f092d5..2700e5a4 100644 --- a/kt-kernel/CMakeLists.txt +++ b/kt-kernel/CMakeLists.txt @@ -42,6 +42,9 @@ if(USE_CONDA_TOOLCHAIN) endif() set(CMAKE_C_COMPILER ${CONDA_CC} CACHE FILEPATH "C compiler" FORCE) set(CMAKE_CXX_COMPILER ${CONDA_CXX} CACHE FILEPATH "C++ compiler" FORCE) +elseif(KTRANSFORMERS_CPU_USE_KML) + set(CMAKE_C_COMPILER "/opt/HPCKit/25.1.0/compiler/gcc/bin/gcc" CACHE FILEPATH "C compiler" FORCE) + set(CMAKE_CXX_COMPILER "/opt/HPCKit/25.1.0/compiler/gcc/bin/g++" CACHE FILEPATH "C++ compiler" FORCE) else() # Prefer system compilers explicitly to avoid accidentally picking conda wrappers from PATH if(EXISTS "/usr/bin/gcc" AND EXISTS "/usr/bin/g++") @@ -417,6 +420,9 @@ elseif(KTRANSFORMERS_USE_MUSA) endif() elseif(KTRANSFORMERS_CPU_USE_KML) message(STATUS "KML CPU detected") + include_directories("/opt/HPCKit/25.1.0/kml/gcc/include") + link_directories(/opt/HPCKit/25.1.0/kml/gcc/lib) + else() message(STATUS "No GPU support enabled, building for CPU only") add_compile_definitions(KTRANSFORMERS_CPU_ONLY=1) diff --git a/kt-kernel/examples/test_moe_kml.py b/kt-kernel/examples/test_moe_kml.py index e2819184..97f3e601 100644 --- a/kt-kernel/examples/test_moe_kml.py +++ b/kt-kernel/examples/test_moe_kml.py @@ -110,13 +110,13 @@ def test_moe(quant_mode: str): CPUInfer.submit(moe.warm_up_task()) CPUInfer.sync() elif quant_mode == "int8": - moe = kt_kernel_ext.moe.KMLInt8_MOE(config) + moe = kt_kernel_ext.moe.Int8_KERNEL_MOE(config) CPUInfer.submit(moe.load_weights_task()) CPUInfer.sync() # CPUInfer.submit(moe.warm_up_task()) # CPUInfer.sync() elif quant_mode == "int4": - moe = kt_kernel_ext.moe.KMLInt4_MOE(config) + moe = kt_kernel_ext.moe.Int4_KERNEL_MOE(config) CPUInfer.submit(moe.load_weights_task()) CPUInfer.sync() CPUInfer.submit(moe.warm_up_task()) diff --git a/kt-kernel/ext_bindings.cpp b/kt-kernel/ext_bindings.cpp index 687005af..e518f8fd 100644 --- a/kt-kernel/ext_bindings.cpp +++ b/kt-kernel/ext_bindings.cpp @@ -28,7 +28,7 @@ #include "operators/kml/mla.hpp" #include "operators/kml/mla_int8.hpp" #endif -#include "operators/kml/moe.hpp" + static const bool _is_plain_ = true; #else static const bool _is_plain_ = false; diff --git a/kt-kernel/operators/common.hpp b/kt-kernel/operators/common.hpp index 3fa39a19..e16f78c8 100644 --- a/kt-kernel/operators/common.hpp +++ b/kt-kernel/operators/common.hpp @@ -4,6 +4,7 @@ #include #include "../cpu_backend/worker_pool.h" +#include "../cpu_backend/shared_mem_buffer.h" #include "ggml.h" #if defined(__aarch64__) && defined(CPU_USE_KML) diff --git a/kt-kernel/operators/moe_kernel/api/common.h b/kt-kernel/operators/moe_kernel/api/common.h index 14c91688..22b83187 100644 --- a/kt-kernel/operators/moe_kernel/api/common.h +++ b/kt-kernel/operators/moe_kernel/api/common.h @@ -4,12 +4,12 @@ #include "llama.cpp/ggml.h" #if !defined(CPUINFER_HAS_FLOAT16_T) -using float16_t = ggml_fp16_t; +// using float16_t = ggml_fp16_t; #define CPUINFER_HAS_FLOAT16_T 1 #endif #if !defined(CPUINFER_HAS_BFLOAT16_T) -using bfloat16_t = ggml_bf16_t; +// using bfloat16_t = ggml_bf16_t; #define CPUINFER_HAS_BFLOAT16_T 1 #endif // CPUINFER_HAS_BFLOAT16_T const bool PACKED = true; diff --git a/kt-kernel/operators/moe_kernel/la/kernel.hpp b/kt-kernel/operators/moe_kernel/la/kernel.hpp index 34d55fc0..6aad8a20 100644 --- a/kt-kernel/operators/moe_kernel/la/kernel.hpp +++ b/kt-kernel/operators/moe_kernel/la/kernel.hpp @@ -870,7 +870,7 @@ struct GemmKernelInt4 { int n_end = std::min(n, N_BLOCK * (ith + 1)); return {n_start, n_end}; } - static std::pair split_range_m(int m, int ith, int mth) { + static std::pair split_range_m(int m, int ith, int mth = 0) { int n_start = M_BLOCK * ith; int n_end = std::min(m, M_BLOCK * (ith + 1)); return {n_start, n_end}; @@ -1106,12 +1106,18 @@ struct GemmKernelInt4 { } } // 对第二个维度分块的 apply scale - static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block) { + static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block, int jth = -1) { // printf("use split apply scale\n"); auto [n_start, n_end] = split_range_n_block(n, ith, nth, block); + int m_start = 0, m_end = m; + if (jth != -1) { + auto tmp = split_range_m(m, jth); + m_start = tmp.first; + m_end = tmp.second; + } // TODO: 后续用 SVE 来加速 - for (int m_begin = 0; m_begin < m; m_begin += M_STEP) { - for (int i = 0; i < M_STEP && m_begin + i < m; i++) { + for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) { + for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) { float *scale_a = ba->get_scale(m, m_begin + i); for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) { for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) { diff --git a/kt-kernel/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm_kernels.cpp b/kt-kernel/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm_kernels.cpp index 4dc9c847..e1480f98 100644 --- a/kt-kernel/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm_kernels.cpp +++ b/kt-kernel/operators/moe_kernel/mat_kernel/kml_kernel/batch_gemm_kernels.cpp @@ -1,4 +1,4 @@ -#include "prefillgemm_int4/integer_gemm_kernels.h" +#include "../batch_gemm_api.hpp" #include "utils.hpp" #ifdef __cplusplus extern "C" {