@@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;
 
-    struct llama_sampler * grmr;
     struct llama_sampler * chain;
 
+    bool grammar;
+
     ring_buffer<llama_token> prev;
 
     std::vector<llama_token_data> cur;
@@ -116,7 +117,6 @@ struct common_sampler {
     void reset() {
         prev.clear();
 
-        llama_sampler_reset(grmr);
         llama_sampler_reset(chain);
     }
 
@@ -184,10 +184,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -234,26 +239,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             trigger_patterns_c.push_back(regex.c_str());
         }
 
-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                trigger_patterns_c.data(), trigger_patterns_c.size(),
-                trigger_tokens.data(), trigger_tokens.size())
-             : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                        llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                            trigger_patterns_c.data(), trigger_patterns_c.size(),
+                            trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }
+
+            grammar = true;
         }
     }
 
-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    std::vector<llama_sampler *> samplers;
     if (params.has_logit_bias()) {
         samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
     }
@@ -316,15 +315,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
     }
 
     for (auto * smpl : samplers) {
-        llama_sampler_chain_add(result->chain, smpl);
+        llama_sampler_chain_add(chain, smpl);
     }
 
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
     return result;
 }
 
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
         llama_sampler_free(gsmpl->chain);
 
         delete gsmpl;
@@ -334,11 +341,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();
 
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
 
-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }
 
     gsmpl->prev.push_back(token);
 }
@@ -349,12 +369,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
+        /* .params  = */ gsmpl->params,
+        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
+        /* .prev    = */ gsmpl->prev,
+        /* .cur     = */ gsmpl->cur,
+        /* .cur_p   = */ gsmpl->cur_p,
     };
 }
 
@@ -407,77 +427,49 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
     return gsmpl->chain;
 }
 
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
+    llama_token id = LLAMA_TOKEN_NULL;
+
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
     {
-        const llama_token id = llama_get_sampled_token_ith(ctx, idx);
+        id = llama_get_sampled_token_ith(ctx, idx);
 
         if (id != LLAMA_TOKEN_NULL) {
             LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+
             return id;
         }
     }
 
-    llama_synchronize(ctx);
-
-    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
-    const auto tm = gsmpl->tm();
-
     gsmpl->set_logits(ctx, idx);
 
-    auto & grmr  = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
 
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
     llama_sampler_apply(chain, &cur_p);
 
     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
 
-    const llama_token id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+    id = cur_p.data[cur_p.selected].id;
 
-    return cur_p.data[cur_p.selected].id;
+    return id;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
 
     std::vector<llama_token> result;
     result.reserve(idxs.size());
 
     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -489,7 +481,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }
 
     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
 
         common_sampler_accept(gsmpl, id, true);
 
@@ -499,13 +491,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }
 
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }
 
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }
 
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
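
For reference, a minimal usage sketch of the updated call sites. This is illustrative only: sample_next is a hypothetical helper, the "sampling.h" include path and an already-decoded batch with fresh logits are assumptions. Callers no longer pass grammar_first; when a grammar is configured, the grammar sampler occupies slot 0 of the chain and is applied as part of the single llama_sampler_apply pass over the chain.

#include "sampling.h"

// sample_next: draw one token with the sampler chain and advance the samplers.
// gsmpl comes from common_sampler_init(model, sparams); ctx already holds logits.
static llama_token sample_next(common_sampler * gsmpl, llama_context * ctx) {
    // idx = -1 selects the logits of the last token in the decoded batch
    const llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ -1);

    // accept_grammar = true also advances the grammar sampler,
    // which (when enabled) is always the first sampler in the chain
    common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);

    return id;
}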