What happened?
There is a segfault with speculative decoding for a sufficiently large prompt (e.g. 'pre /opt/GGUF-Tool-Suite/GGUF-Tool-Suite/quant_assign.py | mods -m g "explain the code"').
/opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-server --model /opt/DevQuasar/ai-sage.GigaChat3-702B-A36B-preview-bf16-GGUF/Q5_K_M/ai-sage.GigaChat3-702B-A36B-preview-bf16.Q5_K_M-00001-of-00035.gguf --model-draft /opt/ubergarm/GigaChat3-10B-A1.8B/IQ4_KSS/GigaChat3-10B-A1.8B-IQ4_KSS.gguf -b 4096 -ub 4096 --split-mode layer --tensor-split 0,1 --main-gpu 1 --ctx-size 32768 --ctx-size-draft 32768 --draft-max 16 --draft-min 1 -ctkd f16 -ctvd f16 --gpu-layers-draft 99 --alias ubergarm/ai-sage.GigaChat3-702B-A36B-preview-bf16-GGUF --mlock --temp 0.7 --top-k 0 --top-p 0.95 --min-p 0.1 --repeat-penalty 1.1 -ctk f16 -ctv f16 -amb 512 --override-tensor exps=CPU --n-gpu-layers 99 --threads 32 --host 0.0.0.0 --port 8080 --log-enable --logdir /var/log/ --jinja --special --verbosity 2 --verbose-prompt --reasoning-format auto --sql-save-file /opt/DevQuasar/ai-sage.GigaChat3-702B-A36B-preview-bf16-GGUF/Q5_K_M/db.sqlite3 --prompt-cache /root/.cache/ik_llama.cpp/prompt-cache.bin --prompt-cache-all --slot-save-path /root/.cache/ik_llama.cpp/slot.bin --lookup-cache-dynamic /root/.cache/ik_llama.cpp/slot.bin --keep -1 --slot-prompt-similarity 0.35 --metrics -cuda fusion=1
warning: Cuda Driver error detected: CUDA Stream does not belong to the expected context   (repeated 13 times)
VERB [ send] send new result | tid="140736909529088" timestamp=1764051832 id_task=0
VERB [ send] queue_results.push_back | tid="140736909529088" timestamp=1764051832 id_task=0
VERB [ process_token] next token | tid="140736909529088" timestamp=1764051832 id_slot=0 id_task=0 token=1153 token_text="The" has_next_token=true n_remain=-1 n_decoded=1 stopped_eos=false stopped_word=false stopped_limit=false stopping_word=""
VERB [ update_slots] max possible draft | tid="140736909529088" timestamp=1764051832 id_slot=0 n_draft_max=16
VERB [ server_sent_event] data stream, to_send: %s | ="data: {\"choices\":[{\"finish_reason\":null,\"index\":0,\"delta\":{\"role\":\"assistant\",\"content\":null}}],\"created\":1764051832,\"id\":\"chatcmpl-sb2s46brcMZULXPaxlO6sI1tXFL3YTrQ\",\"model\":\"\",\"object\":\"chat.completion.chunk\",\"usage\":{\"completion_tokens\":1,\"prompt_tokens\":23265,\"total_tokens\":23266}}\n\n"
VERB [ server_sent_event] data stream, to_send: %s | ="data: {\"choices\":[{\"finish_reason\":null,\"index\":0,\"delta\":{\"content\":\"The\"}}],\"created\":1764051832,\"id\":\"chatcmpl-sb2s46brcMZULXPaxlO6sI1tXFL3YTrQ\",\"model\":\"\",\"object\":\"chat.completion.chunk\",\"usage\":{\"completion_tokens\":1,\"prompt_tokens\":23265,\"total_tokens\":23266}}\n\n"
Thread 1 "llama-server" received signal SIGSEGV, Segmentation fault.
0x00005555557338ea in llama_batch_add(llama_batch&, int, int, std::vector<int, std::allocator<int> > const&, bool) ()
(cuda-gdb) bt full
#0 0x00005555557338ea in llama_batch_add(llama_batch&, int, int, std::vector<int, std::allocator<int> > const&, bool) ()
No symbol table info available.
#1 0x0000555555792693 in llama_speculative_gen_draft(llama_speculative*, llama_speculative_params, std::vector<int, std::allocator<int> > const&, int) ()
No symbol table info available.
#2 0x000055555565baab in server_context::update_slots() ()
No symbol table info available.
#3 0x000055555564343f in server_queue::start_loop() ()
No symbol table info available.
#4 0x00005555555b01b6 in main ()
No symbol table info available.
[EDIT]: Reproduced with a debug build; the same crash now resolves to a source line:
ggml_backend_sched_alloc_splits: failed to allocate graph, reserving (backend_ids_changed = 1)
warning: Cuda Driver error detected: CUDA Stream does not belong to the expected context   (repeated 3 times)
check_node_graph_compatibility_and_refresh_copy_ops(CUDA0#blk.18.ffn_down_exps.weight#0): disabling CUDA graphs due to unsupported node type 4096 8
warning: Cuda Driver error detected: CUDA Stream does not belong to the expected context   (repeated 18 times)
check_node_graph_compatibility_and_refresh_copy_ops(CUDA0#blk.19.ffn_down_exps.weight#0): disabling CUDA graphs due to unsupported node type 4096 8
warning: Cuda Driver error detected: CUDA Stream does not belong to the expected context
Thread 1 "llama-server" received signal SIGSEGV, Segmentation fault.
0x000055555586ba3a in llama_batch_add (batch=..., id=684, pos=2048, seq_ids=..., logits=false)
at /opt/ik_llama.cpp/ik_llama.cpp/common/common.cpp:3409
3409 batch.seq_id[batch.n_tokens][i] = seq_ids[i];
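
For context on that line: `llama_batch_add` fills parallel arrays that are presumably sized once at `llama_batch_init` time and performs no bounds check, so the `seq_id` write faults as soon as `batch.n_tokens` passes the init-time size. Below is a minimal self-contained sketch of that failure mode; the struct and function only mirror common.cpp in shape, and the `capacity` field and assert are illustrative additions, not real fields:

```cpp
#include <cassert>
#include <vector>

// Simplified mirror of llama_batch: parallel arrays sized once at init time.
struct batch_like {
    int n_tokens = 0;
    int capacity = 0;  // illustrative only; the real llama_batch stores no capacity
    std::vector<int>              token;
    std::vector<int>              pos;
    std::vector<std::vector<int>> seq_id;
};

batch_like batch_init(int n) {
    batch_like b;
    b.capacity = n;
    b.token.resize(n);
    b.pos.resize(n);
    b.seq_id.resize(n);
    return b;
}

// Analogue of llama_batch_add: the real function has no bounds check, so once
// n_tokens reaches the init-time size, the seq_id write (common.cpp:3409 here)
// lands outside the allocation and faults.
void batch_add(batch_like & b, int id, int pos, const std::vector<int> & seq_ids) {
    assert(b.n_tokens < b.capacity && "batch overflow -- this is the missing check");
    b.token [b.n_tokens] = id;
    b.pos   [b.n_tokens] = pos;
    b.seq_id[b.n_tokens] = seq_ids;
    b.n_tokens++;
}

int main() {
    batch_like b = batch_init(2048);      // capacity matching the locals below
    for (int i = 0; i <= 2048; ++i) {     // the 2049th add trips the assert
        batch_add(b, /*id=*/684, /*pos=*/i, {0});
    }
}
```

The assert fires on the 2049th add, which matches batch.n_tokens = 2048 and pos = 2048 in the crash above.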
Name and Version
/opt/ik_llama.cpp/ik_llama.cpp/build/bin/llama-server --version
version: 4025 (36c2670)
built with cc (Debian 15.2.0-7) 15.2.0 for x86_64-linux-gnu
What operating system are you seeing the problem on?
No response
Relevant log output
bt full:
at /opt/ik_llama.cpp/ik_llama.cpp/common/speculative.cpp:279
i = 2048
batch = @0x7fe1bad87ed8: {n_tokens = 2048, token = 0x7feecfaeb000, embd = 0x0, pos = 0x7fe1ba45a000, n_seq_id = 0x7fe1ba4d9000,
seq_id = 0x7fe1babb58c0, logits = 0x7ffc1dbe2800 "", all_pos_0 = 0, all_pos_1 = 0, all_seq_id = 0}
ctx_tgt = @0x7fe1bad87ec0: 0x7fff8318d000
ctx_dft = @0x7fe1bad87ec8: 0x7fff6e956800
smpl = @0x7fe1bad87ed0: 0x7ffc1db69800
prompt_dft = @0x7fe1bad87f20: {<std::_Vector_base<int, std::allocator<int> >> = {
_M_impl = {<std::allocator<int>> = {<std::__new_allocator<int>> = {<No data fields>}, <No data fields>}, <std::_Vector_base<int, std::allocator<int> >::_Vector_impl_data> = {_M_start = 0x7fff6bbab000, _M_finish = 0x7fff6bbad000,
_M_end_of_storage = 0x7fff6bbad000}, <No data fields>}}, <No data fields>}
reuse_i = 0
reuse_n = 0
n_ctx = 32752
prompt_tgt_draft_model = {<std::_Vector_base<int, std::allocator<int> >> = {
_M_impl = {<std::allocator<int>> = {<std::__new_allocator<int>> = {<No data fields>}, <No data fields>}, <std::_Vector_base<int, std::allocator<int> >::_Vector_impl_data> = {_M_start = 0x0, _M_finish = 0x0,
_M_end_of_storage = 0x0}, <No data fields>}}, <No data fields>}
__func__ = "llama_speculative_gen_draft"
prompt_tgt = @0x7fffffff6e10: {<std::_Vector_base<int, std::allocator<int> >> = {
_M_impl = {<std::allocator<int>> = {<std::__new_allocator<int>> = {<No data fields>}, <No data fields>}, <std::_Vector_base<int, std::allocator<int> >::_Vector_impl_data> = {_M_start = 0x7f843a183580, _M_finish = 0x7f843a19a104,
_M_end_of_storage = 0x7f843a19a104}, <No data fields>}}, <No data fields>}
i_start = 0
result = {<std::_Vector_base<int, std::allocator<int> >> = {
_M_impl = {<std::allocator<int>> = {<std::__new_allocator<int>> = {<No data fields>}, <No data fields>}, <std::_Vector_base<int, std::allocator<int> >::_Vector_impl_data> = {_M_start = 0x7fabf39ad5c0, _M_finish = 0x7fabf39ad5c0,
_M_end_of_storage = 0x7fabf39ad600}, <No data fields>}}, <No data fields>}
n_past = 21845
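
Reading these locals: reuse_n = 0, so the entire ~23k-token target prompt is apparently being copied into the draft batch, and the fault hits exactly when i and batch.n_tokens reach 2048, i.e. when the batch's init-time size runs out. A self-contained sketch of that shape follows; the capacity, prompt length, and variable names come from the locals and the SSE log above, while the loop itself is only a guess at what speculative.cpp:279 does, not verified code:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const std::size_t batch_capacity = 2048;   // batch.n_tokens at the crash
    std::vector<int>  prompt_tgt(23265);       // prompt_tokens from the SSE log
    std::size_t i_start = 0, reuse_n = 0;      // both 0 in the locals

    std::size_t n_tokens = 0;
    for (std::size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
        if (n_tokens == batch_capacity) {
            // The real fill loop has no such guard: the next llama_batch_add
            // writes past the allocation, matching i = 2048 in the locals.
            std::printf("overflow at i=%zu, n_tokens=%zu\n", i, n_tokens);
            return 1;
        }
        ++n_tokens;                            // stands in for llama_batch_add(...)
    }
    return 0;
}
```

If that reading is right, the draft-side prompt fill would need either a batch sized to the full prompt length or chunked decoding, rather than a fixed 2048-slot batch.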