Skip to content
Open
Show file tree
Hide file tree
Changes from 45 commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
7884b0e
sampling : add support for backend sampling
danbev Nov 17, 2025
9fe9a00
llama-cli : add backend sampler configuration
danbev Nov 17, 2025
f1f3e68
server : add backend sampling options/configuration
danbev Nov 17, 2025
a3eb847
webui : add backend sampling options
danbev Nov 17, 2025
67d3b8e
ggml : add initial cumsum implementation for CUDA
danbev Nov 17, 2025
71574f9
sampling : enable all backend sampler tests
danbev Nov 18, 2025
4b52e59
graph : do not include llama-model.h
ggerganov Nov 18, 2025
82957a9
sampling : always expose sampled_ids
danbev Nov 18, 2025
311c1a3
sampling : ensure at most one output token per seq
danbev Nov 18, 2025
26be108
CUDA: Optimize argsort for gpu-based token sampling
ORippler Nov 18, 2025
0da7e7d
sampling : remove version from sampler chain
danbev Nov 19, 2025
51fee29
sampling : always populate logits for sampled probs
danbev Nov 19, 2025
7e98ebc
sampling : simplify backend sampling logic decode
danbev Nov 19, 2025
d74eb61
squash! sampling : simplify backend sampling logic decode
danbev Nov 19, 2025
38f408c
common : fix regression caused by extra memory allocations during sampling
ggerganov Nov 19, 2025
18ed4d8
squash! sampling : simplify backend sampling logic decode
danbev Nov 19, 2025
0c660e7
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 20, 2025
ed4345b
squash! common : fix regression caused by extra memory allocations during sampling
danbev Nov 20, 2025
0d28b16
sampling : introduce sampling_info struct
danbev Nov 20, 2025
c162562
sampling : return early if backend sampling is disabled
danbev Nov 21, 2025
61ffe41
sampling : use pinned memory for backend sampling buffers
danbev Nov 21, 2025
9b24393
common, tools : refactor model loading to support backend samplers
danbev Nov 21, 2025
79b8cf2
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 21, 2025
65500d0
sampling : add stride variable for clarity
danbev Nov 23, 2025
ae23d2d
sampling: clarify candidate ids usage in comments
danbev Nov 23, 2025
9e273f7
sampling : fix copying both sampled tokens and logits/probs from backend
danbev Nov 23, 2025
50d21aa
tests : cleanup test-backend-sampler.cpp
danbev Nov 24, 2025
7816f0b
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 24, 2025
d88ba18
common : remove build-info.cpp from commit [no ci]
danbev Nov 24, 2025
4a90583
sampling : cleanup and clarify output_reserve
danbev Nov 24, 2025
8eb9b47
sampling : remove redundant checks for stride and size [no ci]
danbev Nov 24, 2025
25f3380
sampling : add debug log when backend sampler selects token
danbev Nov 24, 2025
d0bea21
examples : update batched to use backend sampling
danbev Nov 24, 2025
e2d4f08
llama-cli : fix dangling reference to sampler config
ggerganov Nov 24, 2025
b26c706
common : initialize backend samplers
ggerganov Nov 24, 2025
883a870
samplers : add missing cont
ggerganov Nov 24, 2025
a02adf4
sampling : add assertions for contiguous tensors in async copy functions
danbev Nov 24, 2025
2b4c792
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 25, 2025
0f17ccd
examples : add info about hybrid sampling in batched [no ci]
danbev Nov 25, 2025
53dca56
Merge remote-tracking branch 'upstream/master' into gpu-sampling
danbev Nov 25, 2025
9e5e09d
sampling : remove backend-dist option (wip)
danbev Nov 25, 2025
ec047e1
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 25, 2025
f23b306
CUDA: Add top-k implementation
ORippler Nov 21, 2025
b45d504
sampling : add min-p backend sampler
danbev Nov 26, 2025
4fea191
Use `FetchContent` over CPM as it's bundled with CMake
ORippler Nov 26, 2025
0f7805f
common : add get_active_samplers function to check enabled samplers
danbev Nov 26, 2025
90a3aff
cuda : fix editorconfig-checker warning
danbev Nov 26, 2025
7c2bfb3
Merge remote-tracking branch 'upstream/master' into backend-sampling
danbev Nov 26, 2025
d9d7361
sampling : use argmax for min-p sampling
danbev Nov 27, 2025
51107a0
sampling : fix temperature check to allow zero temperature
danbev Nov 27, 2025
5ea3be2
cuda : fix top-k compilation when CUB is unavailable
danbev Nov 27, 2025
172208a
sampling : add comments about backend sampler [no ci]
danbev Nov 27, 2025
e9d0709
sampling : remove backend sampling chain from common_sampler
danbev Nov 27, 2025
f9889cf
Fix top-k comp & behavior for non-CUB path
ORippler Nov 27, 2025
74be332
sampling : support intermixed backend/cpu samplers
danbev Nov 27, 2025
9ad6522
squash! sampling : support intermixed backend/cpu samplers
danbev Nov 28, 2025
459b7ae
squash! sampling : support intermixed backend/cpu samplers
danbev Nov 28, 2025
117e207
refactor : simplify and improve memory management
ggerganov Nov 28, 2025
333da80
Add initial version for top-p sampling
ORippler Nov 28, 2025
8cac9de
sampling : use logits directly for min-p filtering
danbev Nov 28, 2025
2464d1b
sampling : simplify
ggerganov Nov 28, 2025
fbc8f49
llama : simplify
ggerganov Nov 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1513,6 +1513,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
}
).set_sparam());
add_opt(common_arg(
{"--backend-sampling"},
"enable backend sampling (default: disabled)",
[](common_params & params) {
params.sampling.backend_sampling = true;
}
).set_sparam());
add_opt(common_arg(
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
Expand Down
15 changes: 15 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,21 @@ struct common_init_result common_init_from_params(common_params & params) {

auto cparams = common_context_params_to_llama(params);

// backend sampling initialization
if (params.sampling.backend_sampling) {
llama_sampler * backend_chain = common_sampler_backend_init(model, params.sampling);
if (backend_chain != nullptr) {
iparams.samplers_seq_config.resize(cparams.n_seq_max);
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
iparams.samplers_seq_config[i] = { i, llama_sampler_clone(backend_chain) };
}
cparams.samplers = iparams.samplers_seq_config.data();
cparams.n_samplers = cparams.n_seq_max;

llama_sampler_free(backend_chain);
}
}

llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
Expand Down
5 changes: 5 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

bool backend_sampling = false; // enable backend sampling

// print the parameters into a string
std::string print() const;
};
Expand Down Expand Up @@ -654,6 +656,9 @@ struct common_init_result {
llama_context_ptr context;

std::vector<llama_adapter_lora_ptr> lora;

std::vector<llama_sampler_ptr> samplers;
std::vector<llama_sampler_seq_config> samplers_seq_config;
};

struct common_init_result common_init_from_params(common_params & params);
Expand Down
16 changes: 10 additions & 6 deletions common/llguidance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
}

static llama_sampler_i llama_sampler_llg_i = {
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .name = */ llama_sampler_llg_name,
/* .accept = */ llama_sampler_llg_accept_impl,
/* .apply = */ llama_sampler_llg_apply,
/* .reset = */ llama_sampler_llg_reset,
/* .clone = */ llama_sampler_llg_clone,
/* .free = */ llama_sampler_llg_free,
/* .apply_ggml = */ NULL,
/* .accept_ggml = */ NULL,
/* .set_input_ggml = */ NULL,
/* .set_backend_context = */ NULL,
};

static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
Expand Down
Loading
Loading