Skip to content

Commit 6eea666

Browse files
authored
llama-graph: avoid expand_forward for fusion (ggml-org#17633)
1 parent ff90508 commit 6eea666

File tree

2 files changed: +1 addition, −7 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3274,14 +3274,14 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
             }
         }
-        prev_i = i;
 
 #ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         if (nodes_fused > 0) {
             GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
         }
 #endif
+        prev_i = i;
 
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;

src/llama-graph.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -810,9 +810,6 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1090,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    //expand here so that we can fuse ffn gate
-    ggml_build_forward_expand(gf, cur);
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

0 commit comments

Comments
 (0)