22 changes: 0 additions & 22 deletions .devops/cloud-v-pipeline

This file was deleted.

43 changes: 43 additions & 0 deletions .github/workflows/build-riscv-native.yml
@@ -0,0 +1,43 @@
name: Build on RISCV Linux Machine by Cloud-V
on:
workflow_dispatch:
workflow_call:

jobs:
bianbu-riscv64-native: # Bianbu 2.2
runs-on: self-hosted

steps:
- name: Install prerequisites
run: |
sudo apt-get update || true
sudo apt-get install -y libatomic1
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo apt-get update || true
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
cmake

- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

cmake --build build --config Release -j $(nproc)
12 changes: 1 addition & 11 deletions .github/workflows/build.yml
@@ -443,7 +443,7 @@ jobs:

ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
container: rocm/dev-ubuntu-22.04:6.1.2

steps:
- name: Clone
@@ -471,16 +471,6 @@ jobs:
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)

- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . \
-DCMAKE_C_COMPILER=hipcc \
-DCMAKE_CXX_COMPILER=hipcc \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc)

ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
53 changes: 53 additions & 0 deletions .github/workflows/copilot-setup-steps.yml
@@ -0,0 +1,53 @@
name: "Copilot Setup Steps"

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
workflow_dispatch:
push:
paths:
- .github/workflows/copilot-setup-steps.yml
pull_request:
paths:
- .github/workflows/copilot-setup-steps.yml

jobs:
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
copilot-setup-steps:
runs-on: ubuntu-latest

# Set the permissions to the lowest permissions possible needed for your steps.
# Copilot will be given its own token for its operations.
permissions:
# If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
contents: read

# You can define any steps you want, and they will run before the agent starts.
# If you do not check out your code, Copilot will do this for you.
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: copilot-setup-steps
evict-old-files: 1d

- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install Python dependencies
run: |
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
pip install flake8 pyright
101 changes: 67 additions & 34 deletions common/arg.cpp
@@ -749,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
std::map<std::string, ggml_backend_buffer_type_t> buft_list;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
}
}

for (const auto & override : string_split<std::string>(value, ',')) {
std::string::size_type pos = override.find('=');
if (pos == std::string::npos) {
throw std::invalid_argument("invalid value");
}
std::string tensor_name = override.substr(0, pos);
std::string buffer_type = override.substr(pos + 1);

if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
}
throw std::invalid_argument("unknown buffer type");
}
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(tensor_name);
overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
}
}

struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
@@ -993,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}

if (!params.speculative.tensor_buft_overrides.empty()) {
params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
}

if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
@@ -2349,40 +2386,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
}
}
}

for (const auto & override : string_split<std::string>(value, ',')) {
std::string::size_type pos = override.find('=');
if (pos == std::string::npos) {
throw std::invalid_argument("invalid value");
}
std::string tensor_name = override.substr(0, pos);
std::string buffer_type = override.substr(pos + 1);

if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
}
throw std::invalid_argument("unknown buffer type");
}
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(tensor_name);
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
}
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
}
));
add_opt(common_arg(
{"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2405,6 +2417,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
{"--cpu-moe-draft", "-cmoed"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
[](common_params & params, int value) {
if (value < 0) {
throw std::invalid_argument("invalid value");
}
for (int i = 0; i < value; ++i) {
static std::list<std::string> buft_overrides_draft;
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -3130,7 +3163,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-tbd", "--threads-batch-draft"}, "N",
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3140,7 +3173,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-Cd", "--cpu-mask-draft"}, "M",
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
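For reference, here is a minimal sketch of what the new draft-model override flags amount to in code, assuming a build against llama.cpp's common library. Only the regex pattern and the `ggml_backend_cpu_buffer_type()` call are taken from the `-cmoed` handler above; the helper name and the include set are illustrative:

```cpp
// Hedged sketch, not part of this diff: route the draft model's MoE expert tensors
// to the CPU buffer type programmatically, mirroring what --cpu-moe-draft registers.
// The includes are an assumption about how common_params and the backend API are exposed.
#include "common.h"
#include "ggml-backend.h"

static void keep_draft_moe_on_cpu(common_params & params) {
    // same pattern the -cmoed handler pushes; a string literal has static storage,
    // so no extra lifetime bookkeeping is needed here
    params.speculative.tensor_buft_overrides.push_back(
        {"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
}
```

When the CLI flags populate this vector, `common_params_parse_ex()` appends the `{nullptr, nullptr}` terminator added earlier in this diff, and the speculative examples below copy the vector into `params.tensor_buft_overrides` before loading the draft model.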
1 change: 1 addition & 0 deletions common/common.h
@@ -202,6 +202,7 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
2 changes: 2 additions & 0 deletions examples/speculative-simple/speculative-simple.cpp
@@ -59,6 +59,8 @@ int main(int argc, char ** argv) {
}

params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

common_init_result llama_init_dft = common_init_from_params(params);

//model_dft = llama_init_dft.model.get();
2 changes: 2 additions & 0 deletions examples/speculative/speculative.cpp
@@ -85,6 +85,8 @@ int main(int argc, char ** argv) {
}

params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

common_init_result llama_init_dft = common_init_from_params(params);

model_dft = llama_init_dft.model.get();
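Condensed, the draft-model load path in both examples now looks roughly like the sketch below; the target-model init and the field names are taken from the existing examples, and the surrounding setup code is elided:

```cpp
// Hedged sketch of the flow in examples/speculative*/: load the target model with the
// main overrides, then re-point the shared fields at the draft-specific values
// (now including tensor buffer overrides) before loading the draft model.
common_init_result llama_init_tgt = common_init_from_params(params);          // target model

params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;  // new in this PR

common_init_result llama_init_dft = common_init_from_params(params);          // draft model
```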
5 changes: 0 additions & 5 deletions flake.nix
@@ -36,9 +36,6 @@
# ```
# nixConfig = {
# extra-substituters = [
# # Populated by the CI in ggml-org/llama.cpp
# "https://llama-cpp.cachix.org"
#
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
# # This lets one skip building e.g. the CUDA-enabled openmpi.
@@ -47,10 +44,8 @@
# ];
#
# # Verify these are the same keys as published on
# # - https://app.cachix.org/cache/llama-cpp
# # - https://app.cachix.org/cache/cuda-maintainers
# extra-trusted-public-keys = [
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
# ];
# };
20 changes: 19 additions & 1 deletion ggml/include/ggml.h
@@ -241,6 +241,8 @@
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24

#define GGML_MROPE_SECTIONS 4

#define GGML_UNUSED(x) (void)(x)

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -1660,7 +1662,7 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
@@ -1686,6 +1688,22 @@ extern "C" {
float beta_fast,
float beta_slow);

GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);

GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
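A hedged sketch of how the new in-place variant might be called: the signature, `GGML_MROPE_SECTIONS`, and `GGML_ROPE_TYPE_MROPE` come from this header, while the tensor names, the section split, and the tuning values are purely illustrative.

```cpp
// Illustrative only: apply multi-section (M-RoPE) rotary embedding in place to a
// query tensor, using an assumed temporal/height/width/extra section split.
#include "ggml.h"

static struct ggml_tensor * apply_mrope_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * qcur,   // assumed query tensor
        struct ggml_tensor  * pos,    // assumed multimodal position ids
        int n_dims, int n_ctx_orig, float freq_base, float freq_scale) {
    int sections[GGML_MROPE_SECTIONS] = {16, 24, 24, 0};  // assumed split, not from the PR
    return ggml_rope_multi_inplace(ctx, qcur, pos, /*c=*/NULL,
                                   n_dims, sections, GGML_ROPE_TYPE_MROPE, n_ctx_orig,
                                   freq_base, freq_scale,
                                   /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
                                   /*beta_fast=*/32.0f, /*beta_slow=*/1.0f);
}
```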