Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions kt-kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ if(USE_CONDA_TOOLCHAIN)
endif()
set(CMAKE_C_COMPILER ${CONDA_CC} CACHE FILEPATH "C compiler" FORCE)
set(CMAKE_CXX_COMPILER ${CONDA_CXX} CACHE FILEPATH "C++ compiler" FORCE)
elseif(KTRANSFORMERS_CPU_USE_KML)
set(CMAKE_C_COMPILER "/opt/HPCKit/25.1.0/compiler/gcc/bin/gcc" CACHE FILEPATH "C compiler" FORCE)
set(CMAKE_CXX_COMPILER "/opt/HPCKit/25.1.0/compiler/gcc/bin/g++" CACHE FILEPATH "C++ compiler" FORCE)
Comment on lines +46 to +47
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Hardcoding absolute compiler paths makes the build configuration fragile and non-portable: the build will fail on any machine where HPCKit is not installed at exactly /opt/HPCKit/25.1.0 (a different install prefix or a different toolkit version). It's better to define a variable for the root path of the toolkit and use it here.

You should define HPCKIT_ROOT before this block, for example, near the other options:

set(HPCKIT_ROOT "/opt/HPCKit/25.1.0" CACHE PATH "Root directory of the Kunpeng HPC Kit")

Then you can use it in this block and for the include/link directories later.

    set(CMAKE_C_COMPILER   "${HPCKIT_ROOT}/compiler/gcc/bin/gcc" CACHE FILEPATH "C compiler" FORCE)
    set(CMAKE_CXX_COMPILER "${HPCKIT_ROOT}/compiler/gcc/bin/g++" CACHE FILEPATH "C++ compiler" FORCE)

else()
# Prefer system compilers explicitly to avoid accidentally picking conda wrappers from PATH
if(EXISTS "/usr/bin/gcc" AND EXISTS "/usr/bin/g++")
Expand Down Expand Up @@ -417,6 +420,9 @@ elseif(KTRANSFORMERS_USE_MUSA)
endif()
elseif(KTRANSFORMERS_CPU_USE_KML)
message(STATUS "KML CPU detected")
include_directories("/opt/HPCKit/25.1.0/kml/gcc/include")
link_directories(/opt/HPCKit/25.1.0/kml/gcc/lib)
Comment on lines +423 to +424
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

These paths are also hardcoded, which harms portability. Please use the HPCKIT_ROOT variable suggested for the compiler paths. This makes it much easier to adapt the build to different environments.

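Also, note that link_directories() is a legacy command: it adds the directory to the link search path of every target created after it in this directory scope. It's better to use target_link_libraries() with the full path to the libraries (or target_link_directories()) on the specific targets that need KML, to avoid polluting the link path for all targets.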

    include_directories("${HPCKIT_ROOT}/kml/gcc/include")
    link_directories(${HPCKIT_ROOT}/kml/gcc/lib)


else()
message(STATUS "No GPU support enabled, building for CPU only")
add_compile_definitions(KTRANSFORMERS_CPU_ONLY=1)
Expand Down
4 changes: 2 additions & 2 deletions kt-kernel/examples/test_moe_kml.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ def test_moe(quant_mode: str):
CPUInfer.submit(moe.warm_up_task())
CPUInfer.sync()
elif quant_mode == "int8":
moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
moe = kt_kernel_ext.moe.Int8_KERNEL_MOE(config)
CPUInfer.submit(moe.load_weights_task())
CPUInfer.sync()
# CPUInfer.submit(moe.warm_up_task())
# CPUInfer.sync()
elif quant_mode == "int4":
moe = kt_kernel_ext.moe.KMLInt4_MOE(config)
moe = kt_kernel_ext.moe.Int4_KERNEL_MOE(config)
CPUInfer.submit(moe.load_weights_task())
CPUInfer.sync()
CPUInfer.submit(moe.warm_up_task())
Expand Down
2 changes: 1 addition & 1 deletion kt-kernel/ext_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#include "operators/kml/mla.hpp"
#include "operators/kml/mla_int8.hpp"
#endif
#include "operators/kml/moe.hpp"

static const bool _is_plain_ = true;
#else
static const bool _is_plain_ = false;
Expand Down
1 change: 1 addition & 0 deletions kt-kernel/operators/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <map>

#include "../cpu_backend/worker_pool.h"
#include "../cpu_backend/shared_mem_buffer.h"
#include "ggml.h"

#if defined(__aarch64__) && defined(CPU_USE_KML)
Expand Down
4 changes: 2 additions & 2 deletions kt-kernel/operators/moe_kernel/api/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

#include "llama.cpp/ggml.h"
#if !defined(CPUINFER_HAS_FLOAT16_T)
using float16_t = ggml_fp16_t;
// using float16_t = ggml_fp16_t;
#define CPUINFER_HAS_FLOAT16_T 1
#endif

#if !defined(CPUINFER_HAS_BFLOAT16_T)
using bfloat16_t = ggml_bf16_t;
// using bfloat16_t = ggml_bf16_t;
#define CPUINFER_HAS_BFLOAT16_T 1
#endif // CPUINFER_HAS_BFLOAT16_T
const bool PACKED = true;
Expand Down
14 changes: 10 additions & 4 deletions kt-kernel/operators/moe_kernel/la/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ struct GemmKernelInt4 {
int n_end = std::min(n, N_BLOCK * (ith + 1));
return {n_start, n_end};
}
static std::pair<int, int> split_range_m(int m, int ith, int mth) {
static std::pair<int, int> split_range_m(int m, int ith, int mth = 0) {
int n_start = M_BLOCK * ith;
int n_end = std::min(m, M_BLOCK * (ith + 1));
return {n_start, n_end};
Expand Down Expand Up @@ -1106,12 +1106,18 @@ struct GemmKernelInt4 {
}
}
// 对第二个维度分块的 apply scale
static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block) {
static void apply_scale(int m, int n, float *c, BufferA *ba, BufferB *bb, BufferC *bc, int ith, int nth, int block, int jth = -1) {
// printf("use split apply scale\n");
auto [n_start, n_end] = split_range_n_block(n, ith, nth, block);
int m_start = 0, m_end = m;
if (jth != -1) {
auto tmp = split_range_m(m, jth);
m_start = tmp.first;
m_end = tmp.second;
}
// TODO: 后续用 SVE 来加速
for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
for (int m_begin = m_start; m_begin < m_end; m_begin += M_STEP) {
for (int i = 0; i < M_STEP && m_begin + i < m_end; i++) {
float *scale_a = ba->get_scale(m, m_begin + i);
Comment on lines +1109 to 1121
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This change is also present in GemmKernelInt8 (lines 692-705). The structs GemmKernelInt8 and GemmKernelInt4 share a lot of duplicated code. While this PR makes them consistent, it also propagates the duplication. To improve maintainability, consider refactoring the common code into a base class or a template. This would reduce redundancy and make future changes easier.

for (int n_begin = n_start; n_begin < n_end; n_begin += N_STEP) {
for (int j = 0; j < N_STEP && n_begin + j < n_end; j++) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "prefillgemm_int4/integer_gemm_kernels.h"
#include "../batch_gemm_api.hpp"
#include "utils.hpp"
#ifdef __cplusplus
extern "C" {
Expand Down
Loading