Commit 6958d41

sampling : check backend support during init
1 parent: 1bde707

File tree: 8 files changed (+369, -178 lines)


common/common.cpp

Lines changed: 1 addition & 2 deletions

@@ -1098,8 +1098,7 @@ common_init_result::common_init_result(common_params & params) :

    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
-       llama_sampler * backend_chain = common_sampler_chain_backend(pimpl->samplers[i].get());
-       pimpl->samplers_seq_config[i] = { i, backend_chain };
+       pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
    }

    cparams.samplers = pimpl->samplers_seq_config.data();
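
The same per-sequence wiring can be done from application code. Below is a minimal sketch (not part of this commit) that registers one sampler chain per sequence; it assumes the llama_context_params fields shown in the include/llama.h hunk further down, plus llama_context_default_params(), llama_init_from_model(), and the common_sampler_* helpers:

// sketch: create one common_sampler per sequence and hand its chain to the context.
// `samplers` and `configs` are owned by the caller and must outlive the returned
// context, since the context keeps pointers to the caller's sampler chains.
#include "llama.h"
#include "common.h"
#include "sampling.h"

#include <vector>

static llama_context * init_ctx_with_samplers(
        llama_model * model,
        const common_params_sampling & sparams,
        std::vector<common_sampler *> & samplers,
        std::vector<llama_sampler_seq_config> & configs,
        int n_seq_max) {
    samplers.resize(n_seq_max);
    configs.resize(n_seq_max);

    for (int i = 0; i < n_seq_max; ++i) {
        samplers[i] = common_sampler_init(model, sparams);
        // common_sampler_get() returns the llama_sampler chain owned by the common_sampler
        configs[i]  = { i, common_sampler_get(samplers[i]) };
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max  = n_seq_max;
    cparams.samplers   = configs.data();
    cparams.n_samplers = configs.size();

    return llama_init_from_model(model, cparams);
}

Each common_sampler should later be released with common_sampler_free() once the context has been freed.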

common/sampling.cpp

Lines changed: 20 additions & 34 deletions

@@ -106,7 +106,6 @@ struct common_sampler {

    struct llama_sampler * grmr;
    struct llama_sampler * chain;
-   struct llama_sampler * chain_backend;

    ring_buffer<llama_token> prev;

@@ -119,7 +118,6 @@ struct common_sampler {

        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
-       llama_sampler_reset(chain_backend);
    }

    void set_logits(struct llama_context * ctx, int idx) {

@@ -247,13 +245,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
    }

    auto * result = new common_sampler {
-       /* .params = */ params,
-       /* .grmr = */ grmr,
-       /* .chain = */ llama_sampler_chain_init(lparams),
-       /* .chain_backend = */ llama_sampler_chain_init(lparams),
-       /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-       /* .cur = */ {},
-       /* .cur_p = */ {},
+       /* .params = */ params,
+       /* .grmr = */ grmr,
+       /* .chain = */ llama_sampler_chain_init(lparams),
+       /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+       /* .cur = */ {},
+       /* .cur_p = */ {},
    };

    std::vector<llama_sampler *> samplers;

@@ -318,15 +315,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
        GGML_ASSERT(false && "unknown mirostat version");
    }

-   bool is_backend = params.backend_sampling;
-
-   // split in two chains: backend -> CPU
    for (auto * smpl : samplers) {
-       if (!smpl->iface->backend_apply) {
-           is_backend = false;
-       }
-
-       llama_sampler_chain_add(is_backend ? result->chain_backend : result->chain, smpl);
+       llama_sampler_chain_add(result->chain, smpl);
    }

    return result;

@@ -336,7 +326,6 @@ void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);
        llama_sampler_free(gsmpl->chain);
-       llama_sampler_free(gsmpl->chain_backend);

        delete gsmpl;
    }

@@ -360,13 +349,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
-       /* .params = */ gsmpl->params,
-       /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
-       /* .chain = */ llama_sampler_clone(gsmpl->chain),
-       /* .chain_backend = */ llama_sampler_clone(gsmpl->chain_backend),
-       /* .prev = */ gsmpl->prev,
-       /* .cur = */ gsmpl->cur,
-       /* .cur_p = */ gsmpl->cur_p,
+       /* .params = */ gsmpl->params,
+       /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
+       /* .chain = */ llama_sampler_clone(gsmpl->chain),
+       /* .prev = */ gsmpl->prev,
+       /* .cur = */ gsmpl->cur,
+       /* .cur_p = */ gsmpl->cur_p,
    };
}

@@ -415,20 +403,22 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
}

-struct llama_sampler * common_sampler_chain_backend(const struct common_sampler * gsmpl) {
-   return gsmpl->chain_backend;
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+   return gsmpl->chain;
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    // Check if a backend sampler has already sampled a token in which case we
    // return that token id directly.
    {
        const llama_token id = llama_get_sampled_token_ith(ctx, idx);
+
        if (id != LLAMA_TOKEN_NULL) {
            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
            return id;
        }
    }
+
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations

@@ -556,16 +546,12 @@ llama_token common_sampler_last(const struct common_sampler * gsmpl) {
}

std::string common_sampler_print(const struct common_sampler * gsmpl) {
-   std::string result = llama_sampler_chain_n(gsmpl->chain_backend) > 0 ? "*logits " : "logits ";
-
-   for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain_backend); i++) {
-       const auto * smpl = llama_sampler_chain_get(gsmpl->chain_backend, i);
-       result += std::string("-> *") + llama_sampler_name(smpl) + " ";
-   }
+   std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-       result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+       result += std::string("-> ");
+       result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
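
For reference, a rough usage sketch (not from the commit) of the updated helpers in a generation loop. common_sampler_sample() first asks the context for a backend-sampled token via llama_get_sampled_token_ith() and only runs the CPU chain when none is available; llama_model_get_vocab(), llama_vocab_is_eog(), common_sampler_accept(), common_token_to_piece() and llama_batch_get_one() are assumed from the existing llama.cpp/common API:

#include "llama.h"
#include "common.h"
#include "sampling.h"

#include <cstdio>

// generate up to n_predict tokens; assumes the prompt has already been decoded into ctx
static void generate(llama_context * ctx, common_sampler * smpl, int n_predict) {
    const llama_vocab * vocab = llama_model_get_vocab(llama_get_model(ctx));

    for (int i = 0; i < n_predict; ++i) {
        // idx = -1: sample from the logits of the last token of the previous batch;
        // if a backend sampler already picked a token, it is returned directly
        llama_token id = common_sampler_sample(smpl, ctx, /*idx =*/ -1, /*grammar_first =*/ false);

        if (llama_vocab_is_eog(vocab, id)) {
            break;
        }

        // update sampler state (repetition history, grammar, ...) with the accepted token
        common_sampler_accept(smpl, id, /*accept_grammar =*/ true);

        printf("%s", common_token_to_piece(ctx, id).c_str());
        fflush(stdout);

        if (llama_decode(ctx, llama_batch_get_one(&id, 1)) != 0) {
            break;
        }
    }
}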

common/sampling.h

Lines changed: 1 addition & 1 deletion

@@ -48,7 +48,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

-struct llama_sampler * common_sampler_chain_backend(const struct common_sampler * gsmpl);
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

// extended sampling implementation:
//

include/llama.h

Lines changed: 10 additions & 3 deletions

@@ -369,7 +369,8 @@ extern "C" {
        // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
        // ref: https://github.com/ggml-org/llama.cpp/pull/14363

-       // backend sampler chain configuration (does not keep a reference, so make sure the caller keeps the samplers alive)
+       // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+       // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
        struct llama_sampler_seq_config * samplers;
        size_t n_samplers;
    };

@@ -1193,21 +1194,27 @@ extern "C" {
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL

-       // backend sampling interface
-       void (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+       // backend sampling interface:

+       // return true if the backend supports all ops needed by the sampler
+       // note: call once per sampler
+       bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+       // call after .backend_accept()
        void (*backend_accept)(
                struct llama_sampler * smpl,
                struct ggml_context * ctx,
                struct ggml_cgraph * gf,
                struct ggml_tensor * selected_token);

+       // call after .backend_init()
        void (*backend_apply)(
                struct llama_sampler * smpl,
                struct ggml_context * ctx,
                struct ggml_cgraph * gf,
                struct llama_sampler_data * data);

+       // call before .backend_apply()
        void (*backend_set_input)(struct llama_sampler * smpl);
    };
src/llama-context.cpp

Lines changed: 2 additions & 0 deletions

@@ -68,6 +68,8 @@ llama_context::llama_context(
    for (size_t i = 0; i < params.n_samplers; ++i) {
        const auto & config = params.samplers[i];

+       // TODO: assert this is a llama_sampler_chain instance
+
        if (set_sampler(config.seq_id, config.sampler)) {
            const int n_samplers = llama_sampler_chain_n(config.sampler);