diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 8182a9adf53..08ffb1e37b4 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -928,135 +928,81 @@ bool llama_model_loader::load_all_data(
     std::vector<no_init<uint8_t>> read_buf;
     std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
-    // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
-    // NVMe raid configurations might require more / larger buffers.
    constexpr size_t n_buffers = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
 
    std::vector<ggml_backend_buffer_t> host_buffers;
    std::vector<ggml_backend_event_t> events;
    std::vector<void *> host_ptrs;
-    size_t buffer_idx = 0; // buffer to use for async loads
+    size_t buffer_idx = 0;
    ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
-        if (use_mmap || check_tensors) {
-            return nullptr;
-        }
-        // When not using mmaped io use async uploads from pinned memory to GPU memory.
-        // First determine if the backend supports the necessary features for async uploads.
+        if (use_mmap || check_tensors) { return nullptr; }
        auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
-        if (!buf) {
-            LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
-            return nullptr;
-        }
-
+        if (!buf) { LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); return nullptr; }
        auto * buft = ggml_backend_buffer_get_type(buf);
        auto * dev = ggml_backend_buft_get_device(buft);
-        if (!dev) {
-            LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
-                ggml_backend_buft_name(buft));
-            return nullptr;
-        }
-
-        if (buft != ggml_backend_dev_buffer_type(dev)) {
-            LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
-                ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
-            return nullptr;
-        }
-
+        if (!dev) { LLAMA_LOG_DEBUG("%s: no device found\n", func); return nullptr; }
+        if (buft != ggml_backend_dev_buffer_type(dev)) { return nullptr; }
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
-        if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
-            LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
-                ggml_backend_dev_name(dev));
-            return nullptr;
-        }
-
+        if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { return nullptr; }
        auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
-        if (!host_buft) {
-            LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
-                ggml_backend_dev_name(dev));
-            return nullptr;
-        }
-
-        // If the backend is supported, create pinned memory buffers and events for synchronisation.
+        if (!host_buft) { return nullptr; }
        for (size_t idx = 0; idx < n_buffers; ++idx) {
            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
-            if (!buf) {
-                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
-                    ggml_backend_dev_name(dev));
-                return nullptr;
-            }
-
+            if (!buf) { return nullptr; }
            host_buffers.emplace_back(buf);
            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
-
            auto * event = ggml_backend_event_new(dev);
-            if (!event) {
-                LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
-                    ggml_backend_dev_name(dev));
-                return nullptr;
-            }
-
+            if (!event) { return nullptr; }
            events.emplace_back(event);
        }
-
        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
-        if (!backend) {
-            LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
-                ggml_backend_dev_name(dev));
-            return nullptr;
-        }
-
+        if (!backend) { return nullptr; }
        return backend;
    }(__func__);
 
    if (upload_backend) {
-        LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
-            ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
-            ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
-            ggml_backend_name(upload_backend));
+        LLAMA_LOG_DEBUG("%s: using async uploads\n", __func__);
    }
-
+    int tensor_count = 0;
    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+        tensor_count++;
        const auto * weight = get_weight(ggml_get_name(cur));
-        if (weight == nullptr) {
-            // this can happen with split experts models
-            continue;
-        }
+        if (weight == nullptr) { continue; }
 
        if (progress_callback) {
-            if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
-                return false;
-            }
+            if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; }
        }
 
        size_t n_size = ggml_nbytes(cur);
-
        if (use_mmap) {
+            if (weight->idx >= mappings.size()) {
+                throw std::runtime_error(format("tensor '%s' has invalid file index %d", ggml_get_name(cur), weight->idx));
+            }
            const auto & mapping = mappings.at(weight->idx);
-            ggml_backend_buffer_t buf_mmap = nullptr;
-            if (bufs.count(weight->idx)) {
-                buf_mmap = bufs.at(weight->idx);
+            if (weight->offs + n_size > mapping->size()) {
+                throw std::runtime_error(format("tensor '%s' is out of bounds", ggml_get_name(cur)));
            }
+            ggml_backend_buffer_t buf_mmap = nullptr;
+            if (bufs.count(weight->idx)) { buf_mmap = bufs.at(weight->idx); }
            uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
-
            if (check_tensors) {
                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                }));
            }
-
-            GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+            GGML_ASSERT(buf_mmap || cur->data);
            if (buf_mmap && cur->data == nullptr) {
                ggml_backend_tensor_alloc(buf_mmap, cur, data);
-                if (lmlocks) {
-                    const auto & lmlock = lmlocks->at(weight->idx);
-                    lmlock->grow_to(weight->offs + n_size);
+                if (lmlocks && weight->idx < lmlocks->size()) {
+                    lmlocks->at(weight->idx)->grow_to(weight->offs + n_size);
+                }
+                if (weight->idx < mmaps_used.size()) {
+                    auto & mmap_used = mmaps_used[weight->idx];
+                    mmap_used.first = std::min(mmap_used.first, weight->offs);
+                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                }
-
-                auto & mmap_used = mmaps_used[weight->idx];
-                mmap_used.first = std::min(mmap_used.first, weight->offs);
-                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
            } else {
                ggml_backend_tensor_set(cur, data, 0, n_size);
            }
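
The checks added in the hunk above reduce to a simple invariant: a tensor's file index must name an existing mapping, and the byte range [offs, offs + n_size) must lie inside that mapping. A standalone sketch of that invariant with plain types (mapping_info and check_tensor_range are illustrative, not the loader's structs):

// Hypothetical standalone version of the range check; not the loader's API.
#include <cstddef>
#include <stdexcept>
#include <vector>

struct mapping_info { size_t size; };  // stand-in for an mmap'd file mapping

static void check_tensor_range(const std::vector<mapping_info> & mappings,
                               size_t file_idx, size_t offs, size_t n_size) {
    if (file_idx >= mappings.size()) {
        throw std::runtime_error("tensor has invalid file index");
    }
    if (offs + n_size > mappings[file_idx].size) {
        throw std::runtime_error("tensor is out of bounds");
    }
}
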
@@ -1071,20 +1017,15 @@ bool llama_model_loader::load_all_data(
                    }));
                }
            } else {
-                // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                if (upload_backend) {
                    file->seek(weight->offs, SEEK_SET);
-
                    size_t bytes_read = 0;
-
                    while (bytes_read < n_size) {
                        size_t read_iteration = std::min(buffer_size, n_size - bytes_read);
-
                        ggml_backend_event_synchronize(events[buffer_idx]);
                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
                        ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
                        ggml_backend_event_record(events[buffer_idx], upload_backend);
-
                        bytes_read += read_iteration;
                        ++buffer_idx;
                        buffer_idx %= n_buffers;
@@ -1102,9 +1043,10 @@ bool llama_model_loader::load_all_data(
        }
 
        size_done += n_size;
+        if (tensor_count % 100 == 0) {
+            LLAMA_LOG_INFO("%s: loaded %d tensors\n", __func__, tensor_count);
+        }
    }
-
-    // free temporary resources used for async uploads
    for (auto * event : events) {
        ggml_backend_event_synchronize(event);
        ggml_backend_event_free(event);
@@ -1114,7 +1056,6 @@ bool llama_model_loader::load_all_data(
    }
    ggml_backend_free(upload_backend);
 
-    // check validation results
    bool validation_failed = false;
    for (auto & future : validation_result) {
        auto result = future.get();
@@ -1123,13 +1064,8 @@ bool llama_model_loader::load_all_data(
            validation_failed = true;
        }
    }
-    if (validation_failed) {
-        throw std::runtime_error("found tensors with invalid data");
-    }
-
-    // check if this is the last call and do final cleanup
+    if (validation_failed) { throw std::runtime_error("found tensors with invalid data"); }
    if (size_done >= size_data) {
-        // unmap offloaded tensors and metadata
        if (use_mmap) {
            for (uint32_t idx = 0; idx < mappings.size(); idx++) {
                const auto & mmap_used = mmaps_used.at(idx);
@@ -1141,8 +1077,6 @@ bool llama_model_loader::load_all_data(
            }
        }
        if (progress_callback) {
-            // Even though the model is done loading, we still honor
-            // cancellation since we need to free allocations.
            return progress_callback(1.0f, progress_callback_user_data);
        }
    }
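
The chunked upload loop kept in the second hunk rotates through a small pool of pinned staging buffers: wait on the buffer's event, read the next file chunk into it, issue a non-blocking copy to the device, then record a new event before moving on. A minimal standalone sketch of that rotation, with placeholder callbacks instead of the ggml backend and event API:

// Sketch only: read_chunk/upload_async/wait_for/signal are placeholders, not ggml calls.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

void upload_in_chunks(size_t total_size, size_t buffer_size,
                      std::vector<uint8_t *> & host_ptrs,                                      // pinned staging buffers
                      const std::function<void(void *, size_t, size_t)> & read_chunk,          // file -> pinned host
                      const std::function<void(const void *, size_t, size_t)> & upload_async,  // pinned host -> device
                      const std::function<void(size_t)> & wait_for,                            // block until slot is reusable
                      const std::function<void(size_t)> & signal) {                            // record completion event
    const size_t n_buffers = host_ptrs.size();
    size_t buffer_idx = 0;
    size_t bytes_done = 0;
    while (bytes_done < total_size) {
        const size_t len = std::min(buffer_size, total_size - bytes_done);
        wait_for(buffer_idx);                                 // previous upload from this slot finished
        read_chunk(host_ptrs[buffer_idx], bytes_done, len);   // stage the next chunk
        upload_async(host_ptrs[buffer_idx], bytes_done, len); // non-blocking device copy
        signal(buffer_idx);                                   // fence for the next reuse of this slot
        bytes_done += len;
        buffer_idx = (buffer_idx + 1) % n_buffers;
    }
}
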
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 312e347a4c7..be2d790418c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -13898,6 +13898,11 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
            // Apply sparse attention if available, otherwise use regular attention
            if (use_sparse_attention) {{
+                // Guard: Only use sparse attention if inp_attn and mctx are valid
+                if (!inp_attn || !inp_attn->mctx) {
+                    use_sparse_attention = false;
+                    goto regular_attention_mla;
+                }
                const auto * mctx_cur = inp_attn->mctx;
                ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, Kcur, inp_attn->get_k_idxs(), il));
                ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
@@ -13969,6 +13974,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n", (int)top_k, il);
            } else {
+                regular_attention_mla:
                // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                cur = build_attn(inp_attn, model.layers[il].wo, NULL,
@@ -14006,6 +14012,11 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
            // Apply sparse attention if available, otherwise use regular attention
            if (use_sparse_attention) {{
+                // Guard: Only use sparse attention if inp_attn and mctx are valid
+                if (!inp_attn || !inp_attn->mctx) {
+                    use_sparse_attention = false;
+                    goto regular_attention_mha;
+                }
                const auto * mctx_cur = inp_attn->mctx;
                ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, Kcur, inp_attn->get_k_idxs(), il));
                ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
@@ -14064,6 +14075,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n", (int)top_k, il);
            } else {
+                regular_attention_mha:
                // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                cur = build_attn(inp_attn, model.layers[il].wo, NULL,
diff --git a/src/llama-sparse-mla-fwd.cpp b/src/llama-sparse-mla-fwd.cpp
index 2a3d79ac27b..6ab65382e09 100644
--- a/src/llama-sparse-mla-fwd.cpp
+++ b/src/llama-sparse-mla-fwd.cpp
@@ -61,7 +61,7 @@ using std::function;
    const int64_t T = q_cur->ne[2];
    // Fused decode path: use custom CUDA op when T == 1
    const char * env_fused_dec = getenv("LLAMA_SPARSE_MLA_FUSED_DECODE");
-    if (T == 1 && (env_fused_dec == nullptr || atoi(env_fused_dec) != 0)) {
+    if (T == 1 && (env_fused_dec != nullptr && atoi(env_fused_dec) != 0)) {
        // Build q_t [Dq, Hq]
        ggml_tensor * q_cur_cont2 = ggml_cont(ctx, q_cur);
        ggml_tensor * q_all_2d2 = ggml_reshape_2d(ctx, q_cur_cont2, Dq, Hq*T);
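
The llama-sparse-mla-fwd.cpp change flips LLAMA_SPARSE_MLA_FUSED_DECODE from opt-out to opt-in: before, the fused decode path ran whenever the variable was unset or non-zero; now it runs only when the variable is explicitly set to a non-zero value. The two predicates side by side (the helper names are illustrative, not part of the tree):

#include <cstdlib>

// old gating: enabled unless LLAMA_SPARSE_MLA_FUSED_DECODE=0 (opt-out)
static bool fused_decode_opt_out() {
    const char * v = getenv("LLAMA_SPARSE_MLA_FUSED_DECODE");
    return v == nullptr || atoi(v) != 0;
}

// new gating: enabled only if LLAMA_SPARSE_MLA_FUSED_DECODE is set and non-zero (opt-in)
static bool fused_decode_opt_in() {
    const char * v = getenv("LLAMA_SPARSE_MLA_FUSED_DECODE");
    return v != nullptr && atoi(v) != 0;
}
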
diff --git a/src/llama-sparse-topk.cpp b/src/llama-sparse-topk.cpp
index 1f5842b94a3..d066eeb30c7 100644
--- a/src/llama-sparse-topk.cpp
+++ b/src/llama-sparse-topk.cpp
@@ -669,33 +669,12 @@ ggml_tensor * sparse_attn_topk::select_topk_tokens_indexer_kvaware(
        // Compute top-k indices via CUDA radix selection
        const int64_t k_tile = std::min(k, scores_clamped->ne[0]);
        ggml_tensor * topk_tc = nullptr;
-        if (have_windows && win_ends) {
-            // slice starts/ends for this tile [t0, t0+Tc)
-            ggml_tensor * starts_tile = nullptr;
-            ggml_tensor * ends_tile = nullptr;
-            if (win_starts) {
-                size_t off_s = (size_t)t0 * win_starts->nb[0];
-                starts_tile = ggml_view_1d(ctx, win_starts, Tc, off_s);
-                starts_tile = ggml_cont(ctx, starts_tile);
-            }
-            if (win_ends) {
-                size_t off_e = (size_t)t0 * win_ends->nb[0];
-                ends_tile = ggml_view_1d(ctx, win_ends, Tc, off_e);
-                ends_tile = ggml_cont(ctx, ends_tile);
-            }
-            if (dbg) {
-                printf("[TOPK] using start and end\n");
-                fflush(stdout);
-            }
-            topk_tc = ggml_sparse_topk_radix_ex(ctx, scores_clamped, (int)k_tile, starts_tile, ends_tile);
-        } else {
-            if (dbg) {
-                printf("[TOPK] not using start and end, have_windows=%s win_ends=%s\n",
-                       have_windows ? "true" : "false", win_ends ? "true" : "false");
-                fflush(stdout);
-            }
-            topk_tc = ggml_sparse_topk_radix(ctx, scores_clamped, (int)k_tile);
-        }
+#if defined(__APPLE__)
+        // Force CPU fallback for top-k since Metal backend is missing SPARSE_TOPK_RADIX
+        topk_tc = sparse_attn_topk::topk_radix_indices(ctx, scores_clamped, (int)k_tile);
+#else
+        topk_tc = ggml_sparse_topk_radix(ctx, scores_clamped, (int)k_tile);
+#endif
        if (dbg && t0 == 0) {
            cb(topk_tc, "idxkv_topk_radix", -1);
            int64_t kk = std::min(k_tile, (int64_t)16);
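
For reference, the operation being dispatched above is "indices of the k largest entries of a score vector", evaluated per query tile. A plain CPU version of that selection (a partial_sort reference only, not the radix kernel or the tree's CPU fallback):

// Reference implementation only; the real kernels operate on ggml tensors per tile.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

static std::vector<int32_t> topk_indices_ref(const std::vector<float> & scores, int64_t k) {
    std::vector<int32_t> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0);                      // candidate indices 0..n-1
    const int64_t kk = std::min<int64_t>(k, (int64_t) idx.size());
    std::partial_sort(idx.begin(), idx.begin() + kk, idx.end(),
                      [&](int32_t a, int32_t b) { return scores[a] > scores[b]; });
    idx.resize(kk);                                            // keep the k highest-scoring indices
    return idx;
}
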