Skip to content

Commit 6eea666

Browse files
authored
llama-graph: avoid expand_forward for fusion (ggml-org#17633)
1 parent ff90508 commit 6eea666

File tree

2 files changed: +1 addition, −7 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3274,14 +3274,14 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
             }
         }
-        prev_i = i;
 
 #ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         if (nodes_fused > 0) {
             GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
         }
 #endif
+        prev_i = i;
 
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;

src/llama-graph.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -810,9 +810,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1090,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

0 commit comments

Comments
 (0)