Commit b837773

Port speculative decoding from upstream to llama-server (#645)
* server : integrate speculative decoding
* server: Fix field names
* server: fix include, whitespace
* fix compile errors in speculative.cpp
* add llama_sampling_sample_and_accept_n to sampling
* finish porting speculative decoding in server
* port functions from common/speculative, common/sampling
* remove arg
* fix function names
* init params_dft to none
* correct value for n_ctx
* prefix kv cache tensors with model name to avoid conflict
* fix call arguments
* fix spec decoding args
* correct slot.id
* use n_max
* port the rest of sampling funcs
* fix func arguments
* slot.id starts at 1?
* Revert "prefix kv cache tensors with model name to avoid conflict" (this reverts commit fbd5dfd)
* disable draft logging
* disable logging in speculative.cpp: in mainline these would be LOG_DEBUG, but since ik_llama doesn't support it, logging is disabled entirely
* add more draft model parameters
* fix
* pass flash_attn
* add speculative params for parity
* set speculative params in launch_slot_with_task instead
1 parent 4239d25 commit b837773

8 files changed, +655 -41 lines changed


common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ add_library(${TARGET} STATIC
     minja.hpp
     ngram-cache.h
     ngram-cache.cpp
+    speculative.cpp
     )
 
 if (BUILD_SHARED_LIBS)

common/common.cpp

Lines changed: 34 additions & 5 deletions
@@ -505,6 +505,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_ctx = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "-cd" || arg == "--ctx-size-draft") {
+        CHECK_ARG
+        params.n_ctx_draft = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--grp-attn-n" || arg == "-gan") {
         CHECK_ARG
         params.grp_attn_n = std::stoi(argv[i]);
@@ -725,7 +730,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             }
         }
         return true;
-    }
+    }
     if (arg == "--cfg-negative-prompt") {
         CHECK_ARG
         sparams.cfg_negative_prompt = argv[i];
@@ -765,11 +770,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_keep = std::stoi(argv[i]);
         return true;
     }
-    if (arg == "--draft") {
+    if (arg == "--draft" || arg == "--draft-max" || arg == "--draft-n") {
         CHECK_ARG
         params.n_draft = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--draft-min" || arg == "--draft-n-min") {
+        CHECK_ARG
+        params.n_draft_min = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--draft-p-min") {
+        CHECK_ARG
+        params.p_draft_min = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "--chunks") {
         CHECK_ARG
         params.n_chunks = std::stoi(argv[i]);
@@ -934,6 +949,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cache_type_v = argv[++i];
         return true;
     }
+    if (arg == "-ctkd" || arg == "--cache-type-k-draft") {
+        params.cache_type_k_draft = argv[++i];
+        return true;
+    }
+    if (arg == "-ctvd" || arg == "--cache-type-v-draft") {
+        params.cache_type_v_draft = argv[++i];
+        return true;
+    }
     if (arg == "-mli" || arg == "--multiline-input") {
         params.multiline_input = true;
         return true;
@@ -1071,7 +1094,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             size_t pos = 0;
             while ((pos = servers.find(",")) != std::string::npos) {
                 std::string server = servers.substr(0, pos);
-                ggml_backend_rpc_buffer_type(server.c_str());
+                ggml_backend_rpc_buffer_type(server.c_str());
                 servers.erase(0, pos + 1);
             }
             ggml_backend_rpc_buffer_type(servers.c_str());
@@ -1693,14 +1716,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
     options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
                         "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
-    options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
                         "path to static lookup cache to use for lookup decoding (not updated by generation)" });
     options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
                         "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
 
     options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+    options.push_back({ "*", "-cd, --ctx-size-draft N", "size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.n_ctx_draft });
     options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
@@ -1811,6 +1834,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
     options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
     options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
+    options.push_back({ "*", "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" });
+    options.push_back({ "*", "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" });
 
     options.push_back({ "perplexity" });
     options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
@@ -1893,6 +1918,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
     options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
     options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
+    options.push_back({ "*", "--draft-max, --draft, --draft-n N",
+                        "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
+    options.push_back({ "*", "--draft-min, --draft-n-min N", "minimum number of draft tokens to use for speculative decoding" });
+    options.push_back({ "*", "--draft-p-min P", "minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.p_draft_min });
 
     options.push_back({ "retrieval" });
     options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
@@ -2052,7 +2081,7 @@ std::string string_join(const std::vector<std::string> & strs, const std::string
     if (strs.empty()) {
        return "";
     }
-
+
     std::ostringstream oss;
     for (size_t i = 0; i < strs.size(); ++i) {
         if (i > 0) {
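
Taken together, the new options let llama-server be pointed at a separate draft model with its own context size and KV cache types. A hypothetical invocation (the -m / -md model options and the GGUF file names stand in for the existing model flags and are not part of this diff; the values are only illustrative):

    llama-server -m target-model.gguf -md draft-model.gguf \
        -cd 4096 -ctkd q8_0 -ctvd q8_0 \
        --draft-max 16 --draft-min 1 --draft-p-min 0.8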

common/common.h

Lines changed: 6 additions & 1 deletion
@@ -83,10 +83,13 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
+    int32_t n_ctx_draft = 0; // context size for draft model
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+    int32_t n_draft = 16; // number of tokens to draft during speculative decoding
+    int32_t n_draft_min = 1; // minimum number of tokens to draft during speculative decoding
+    float p_draft_min = 0.8f; // minimum speculative decoding probability (greedy)
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
@@ -207,6 +210,8 @@ struct gpt_params {
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
+    std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
+    std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
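
For reference, a default-constructed gpt_params now carries the following draft-related settings. The notes restate the defaults from the diff; the interpretation of each field follows the flag descriptions and the upstream speculative helper, so treat this as a reading aid rather than part of the change:

    gpt_params params;
    // params.n_draft            == 16    -> upper bound on drafted tokens per speculation step (--draft-max)
    // params.n_draft_min        == 1     -> smallest draft that is worth verifying (--draft-min)
    // params.p_draft_min        == 0.8f  -> drafting stops once the draft model's top-token
    //                                       probability falls below this value (--draft-p-min)
    // params.n_ctx_draft        == 0     -> 0 = use the context size loaded from the draft model (-cd)
    // params.cache_type_k_draft == ""    -> unset; presumably falls back to -ctk / -ctv
    // params.cache_type_v_draft == ""    //   (same for V)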

common/sampling.cpp

Lines changed: 47 additions & 1 deletion
@@ -442,7 +442,9 @@ static llama_token_data_array llama_sampling_prepare_impl(
         cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    ctx_sampling->cur_p = { cur.data(), cur.size(), false };
+
+    llama_token_data_array & cur_p = ctx_sampling->cur_p;
 
     // apply penalties
     const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
@@ -506,3 +508,47 @@ void llama_sampling_accept(
         llama_sampler_dry_accept(ctx_sampling->smpl, id);
     }
 }
+
+llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling) {
+    return &ctx_sampling->cur_p;
+}
+
+std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<llama_token> & draft) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return llama_sampling_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+}
+
+std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+
+        llama_sampling_accept(gsmpl, ctx, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = llama_sampling_sample(gsmpl, ctx, nullptr, idxs[i]);
+
+        llama_sampling_accept(gsmpl, ctx, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
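
The two-argument overload above is what the server's speculative step drives: after the target model has evaluated the last accepted token plus all drafted tokens in a single batch, it samples position by position and keeps drafted tokens only while they agree. A minimal sketch of such a caller (ctx_tgt, ctx_sampling, id_last, n_past and draft are illustrative locals; llama_batch_init / llama_batch_add / llama_batch_free and llama_kv_cache_seq_rm are assumed to be the helpers already provided by llama.h and common.h):

    // verify a draft against the target model (sketch, not the actual server code)
    llama_batch batch = llama_batch_init((int32_t) draft.size() + 1, 0, 1);
    llama_batch_add(batch, id_last, n_past, { 0 }, true);               // logits requested at every position
    for (size_t i = 0; i < draft.size(); ++i) {
        llama_batch_add(batch, draft[i], n_past + 1 + (llama_pos) i, { 0 }, true);
    }
    llama_decode(ctx_tgt, batch);

    // samples at output indices 0..draft.size() and stops at the first drafted
    // token the target model disagrees with; always returns at least one token
    const std::vector<llama_token> ids = llama_sampling_sample_and_accept_n(ctx_sampling, ctx_tgt, draft);

    // ids.size() - 1 drafted tokens were accepted; the last element is the target
    // model's own continuation. The rejected tail of the draft is still in the KV
    // cache and has to be removed (e.g. llama_kv_cache_seq_rm) before the next step.
    n_past += (int) ids.size();
    llama_batch_free(batch);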

common/sampling.h

Lines changed: 10 additions & 0 deletions
@@ -101,6 +101,8 @@ struct llama_sampling_context {
 
     size_t n_valid; // Number of correct top tokens with correct probabilities.
 
+    llama_token_data_array cur_p; // current candidates
+
     std::mt19937 rng;
 };
 
@@ -176,3 +178,11 @@ void llama_sampling_accept(
         struct llama_context * ctx_main,
         llama_token id,
         bool apply_grammar);
+
+// returns at least 1 token, up to draft.size()
+// access the internal list of current candidate tokens
+llama_token_data_array * llama_sampling_get_candidates(struct llama_sampling_context * ctx_sampling);
+
+std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<llama_token> & draft);
+
+std::vector<llama_token> llama_sampling_sample_and_accept_n(struct llama_sampling_context * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const std::vector<llama_token> & draft);
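
llama_sampling_get_candidates exists mainly for the ported common/speculative draft loop: after greedily sampling a draft token, the loop can read that token's probability from cur_p and stop drafting early. A rough sketch of that use, modelled on the upstream helper this commit ports (smpl_dft, ctx_dft, draft and params are illustrative, and the check assumes the draft sampler chain leaves sorted, normalized probabilities in cur_p):

    while ((int) draft.size() < params.n_draft) {
        const llama_token id = llama_sampling_sample(smpl_dft, ctx_dft, nullptr, 0);

        // cur_p was filled during sampling; with greedy draft sampling, data[0]
        // is the token that was just chosen, together with its probability
        const llama_token_data_array * cur_p = llama_sampling_get_candidates(smpl_dft);

        // --draft-p-min: stop drafting once the draft model is no longer confident
        if (cur_p->data[0].p < params.p_draft_min) {
            break;
        }

        llama_sampling_accept(smpl_dft, ctx_dft, id, true);
        draft.push_back(id);

        // ... feed `id` back through the draft model to obtain logits for the next position ...
    }
    // the server is then expected to use the draft only when draft.size() >= params.n_draft_min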
