From 12d5c098e66ed9fd3982b51b09f13c9ae537315c Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Thu, 20 Nov 2025 13:56:20 +0800
Subject: [PATCH 1/5] [Models] Add forward_meta to MoE models' forward functions

---
 .../layers/moe/fused_moe_backend_base.py      |  4 +++-
 fastdeploy/model_executor/layers/moe/moe.py   |  7 ++++---
 .../model_executor/models/deepseek_v3.py      |  6 +++---
 .../model_executor/models/ernie4_5_moe.py     | 21 ++++++++++++++++++++-----
 .../model_executor/models/ernie4_5_mtp.py     |  4 ++--
 .../models/ernie4_5_vl/ernie4_5_vl_moe.py     | 18 +++++++++++---------
 fastdeploy/model_executor/models/glm4_moe.py  | 11 +++++++----
 fastdeploy/model_executor/models/gpt_oss.py   |  6 +++---
 fastdeploy/model_executor/models/qwen2.py     |  4 ++--
 fastdeploy/model_executor/models/qwen3moe.py  | 10 +++++-----
 fastdeploy/spec_decode/mtp.py                 |  2 +-
 11 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
index 03191066713..4aba9982c4a 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
@@ -19,6 +19,7 @@
 import paddle
 from paddle import nn
 
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.utils import (
     TensorTracker,
     default_weight_loader,
@@ -198,13 +199,14 @@ def apply(
         layer: nn.Layer,
         x: paddle.Tensor,
         gate: nn.Layer,
+        forward_meta: ForwardMeta,
     ) -> paddle.Tensor:
         """
         Paddle Cutlass compute Fused MoE.
         """
         if layer.ep_size > 1:
             is_moe_start_layer = layer.layer_idx == layer.fd_config.model_config.moe_layer_start_index
-            if layer.fd_config.model_config.moe_phase.phase == "prefill":
+            if forward_meta.moe_phase.phase == "prefill":
                 if layer.fd_config.scheduler_config.splitwise_role == "mixed" and is_moe_start_layer:
                     self.ep_prefill_runner.clean_low_latency_buffer()
                 return self.apply_ep_prefill(layer, x, gate)
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index ede87972185..d78167eb48a 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -21,6 +21,7 @@
 from paddleformers.utils.log import logger
 
 from fastdeploy import envs
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.utils import h2d_copy, slice_fn
 from fastdeploy.platforms import current_platform
@@ -593,7 +594,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
         out = multi_outs[:token_num, :]
         return out
 
-    def forward(self, x: paddle.Tensor, gate: nn.Layer):
+    def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta):
         """
         Defines the forward computation of the moe layer.
@@ -611,7 +612,7 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer):
             and (not self.fd_config.parallel_config.use_sequence_parallel_moe)
             and token_num >= self.tp_size
         ):
-            out = self.forward_split_allgather(x, gate)
+            out = self.forward_split_allgather(x, gate, forward_meta)
         else:
-            out = self.quant_method.apply(self, x, gate)
+            out = self.quant_method.apply(self, x, gate, forward_meta)
         return out
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
index 04fa0abd09b..b2abd2c974d 100644
--- a/fastdeploy/model_executor/models/deepseek_v3.py
+++ b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -104,7 +104,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -187,7 +187,7 @@ def load_state_dict(self, state_dict):
         self.experts.load_state_dict(state_dict)
         self.shared_experts.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
         """ """
         shared_experts_out = self.shared_experts(hidden_states)
         moe_out = self.experts(hidden_states, self.gate)
@@ -514,7 +514,7 @@ def forward(
             hidden_states = self.self_attn(forward_meta, hidden_states, position_ids, mask_encoder_batch)
 
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 75947590be8..3ee414b220c 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -95,7 +95,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
         gate_up_out = self.up_gate_proj(hidden_states)
         act_out = self.act_fn(gate_up_out)
         down_out = self.down_proj(act_out)
@@ -213,8 +213,16 @@ def load_state_dict(self, state_dict):
     def update_state_dict(self, state_dict):
         self.fused_moe.load_state_dict(state_dict, True)
 
-    def forward(self, hidden_states: paddle.Tensor):
-        out = self.experts(hidden_states, self.gate)
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        forward_meta: ForwardMeta,
+    ):
+        out = self.experts(
+            x=hidden_states,
+            gate=self.gate,
+            forward_meta=forward_meta,
+        )
         if self.num_shared_experts > 0:
             s_x = self.shared_experts(hidden_states)
             out = out + s_x
@@ -344,7 +352,10 @@ def forward(
             residual,
         )
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(
+            forward_meta=forward_meta,
+            hidden_states=hidden_states,
+        )
 
         return hidden_states, residual
@@ -621,7 +632,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate)
+            self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate, self.forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py
index 0aedb040062..2d57ed504cb 100644
--- a/fastdeploy/model_executor/models/ernie4_5_mtp.py
+++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py
@@ -436,7 +436,7 @@ def compute_logits(self, hidden_states: paddle.Tensor):
 
         return logits
 
-    def empty_input_forward(self):
+    def empty_input_forward(self, forward_meta):
         """
         empty_input_forward
         """
@@ -448,7 +448,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.fused_moe(fake_hidden_states)
+            self.ernie.layers[i].mlp.fused_moe(hidden_states=fake_hidden_states, forward_meta=forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index a291db0e9a5..6f518582bd9 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -169,8 +169,8 @@ def __init__(
             model_format="",
         )
 
-    def forward(self, hidden_states: paddle.Tensor):
-        out = self.experts(hidden_states, self.gate)
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
+        out = self.experts(hidden_states, self.gate, forward_meta)
         return out
 
     def load_state_dict(self, state_dict):
@@ -269,7 +269,7 @@ def load_state_dict(self, state_dict):
         if self.num_shared_experts > 0:
             self.shared_experts.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta, vl_moe_meta: VLMoEMeta):
         if self.num_shared_experts > 0:
             shared_experts_out = self.shared_experts(hidden_states)
         hidden_states, text_input, image_input = text_image_gather_scatter(
@@ -281,8 +281,8 @@ def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
             vl_moe_meta.image_index,
             True,
         )
-        text_out = self.text_fused_moe(text_input)
-        image_out = self.image_fused_moe(image_input)
+        text_out = self.text_fused_moe(text_input, forward_meta)
+        image_out = self.image_fused_moe(image_input, forward_meta)
         hidden_states, _, _ = text_image_gather_scatter(
             hidden_states,
             text_out,
@@ -388,9 +388,9 @@ def forward(
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         if isinstance(self.mlp, Ernie4_5_VLMoE):
-            hidden_states = self.mlp(hidden_states, vl_moe_meta)
+            hidden_states = self.mlp(hidden_states, forward_meta, vl_moe_meta)
         else:
-            hidden_states = self.mlp(hidden_states)
+            hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
@@ -757,8 +757,8 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
-            self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)
+            self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states, self.forward_meta)
+            self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states, self.forward_meta)
 
     def get_input_embeddings(
         self,
diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index 8850ce81243..c18762d49d4 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -85,7 +85,7 @@ def __init__(
             act_method=fd_config.model_config.hidden_act,
         )
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -161,9 +161,9 @@ def __init__(
             reduce_results=False,
         )
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         shared_experts_out = self.shared_experts(x)
-        out = self.experts(x, self.gate)
+        out = self.experts(x, self.gate, forward_meta)
         out = out + shared_experts_out
         # We do to TP all reduce after the sum of experts.
         if self.tensor_parallel_size > 1:
@@ -306,7 +306,10 @@ def forward(
 
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(
+            hidden_states,
+            forward_meta,
+        )
 
         return hidden_states, residual
diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py
index e951fff92f5..682c9f5f1ec 100644
--- a/fastdeploy/model_executor/models/gpt_oss.py
+++ b/fastdeploy/model_executor/models/gpt_oss.py
@@ -124,8 +124,8 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = ""):
             model_format="",
         )
 
-    def forward(self, hidden_states: paddle.Tensor):
-        expert_output = self.experts(hidden_states, self.router)
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
+        expert_output = self.experts(hidden_states, self.router, forward_meta)
         return expert_output
 
 
@@ -173,7 +173,7 @@ def forward(
 
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py
index d49f3a32705..e56423c9136 100644
--- a/fastdeploy/model_executor/models/qwen2.py
+++ b/fastdeploy/model_executor/models/qwen2.py
@@ -90,7 +90,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -206,7 +206,7 @@ def forward(
 
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py
index 9537b84f22c..c16d83d8331 100644
--- a/fastdeploy/model_executor/models/qwen3moe.py
+++ b/fastdeploy/model_executor/models/qwen3moe.py
@@ -79,8 +79,8 @@ def __init__(
             weight_dtype="float32",
         )
 
-    def forward(self, x):
-        return self.experts(x, self.gate)
+    def forward(self, x, forward_meta):
+        return self.experts(x, self.gate, forward_meta)
 
     def load_state_dict(self, state_dict):
         """ """
@@ -127,7 +127,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -206,7 +206,7 @@ def forward(
 
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
@@ -430,7 +430,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate)
+            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate, self.forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 3b40c8c164f..be9f90187bd 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -904,7 +904,7 @@ def _propose(self, step_use_cudagraph: bool = False):
                 self._get_self_hidden_states(hidden_states)
             else:
                 if hasattr(self.model, "empty_input_forward"):
-                    self.model.empty_input_forward()
+                    self.model.empty_input_forward(self.forward_meta)
 
     def _get_self_hidden_states(self, hidden_states):
         target_hidden_states = eagle_get_self_hidden_states(

From 0f27bd82bedc13c14d0a8c7f1d72326f4c22569b Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Thu, 20 Nov 2025 14:34:04 +0800
Subject: [PATCH 2/5] fix missing forward_meta param

---
 fastdeploy/model_executor/layers/moe/moe.py               | 4 ++--
 fastdeploy/model_executor/models/deepseek_v3.py           | 4 ++--
 fastdeploy/model_executor/models/ernie4_5_moe.py          | 4 ++--
 .../model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py  | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index d78167eb48a..36da0819a14 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -573,7 +573,7 @@ def load_state_dict(self, state_dict, is_rearrange: bool = False):
         else:
             self.quant_method.process_loaded_weights(self, state_dict)
 
-    def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
+    def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta):
         """
         Forward split allgather function.
         """
@@ -588,7 +588,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
             if end_offset > token_num:
                 end_offset = token_num
             part_x[: (end_offset - start_offset), :] = x[start_offset:end_offset, :]
-        out = self.quant_method.apply(self, part_x, gate)
+        out = self.quant_method.apply(self, part_x, gate, forward_meta)
         multi_outs = paddle.zeros([token_num_per_rank * self.tp_size, x.shape[1]], dtype=x.dtype)
         paddle.distributed.all_gather(multi_outs, out, self.tp_group)
         out = multi_outs[:token_num, :]
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
index b2abd2c974d..5fab9808dbe 100644
--- a/fastdeploy/model_executor/models/deepseek_v3.py
+++ b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -189,8 +189,8 @@ def load_state_dict(self, state_dict):
 
     def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
         """ """
-        shared_experts_out = self.shared_experts(hidden_states)
-        moe_out = self.experts(hidden_states, self.gate)
+        shared_experts_out = self.shared_experts(hidden_states, forward_meta)
+        moe_out = self.experts(hidden_states, self.gate, forward_meta)
         moe_out = moe_out + shared_experts_out
         # We do to TP all reduce after the sum of experts.
         if self.tp_size > 1:
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 3ee414b220c..aae9998463f 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -224,7 +224,7 @@ def forward(
             forward_meta=forward_meta,
         )
         if self.num_shared_experts > 0:
-            s_x = self.shared_experts(hidden_states)
+            s_x = self.shared_experts(hidden_states, forward_meta)
             out = out + s_x
         return out
 
@@ -353,8 +353,8 @@ def forward(
         )
 
         hidden_states = self.mlp(
-            forward_meta=forward_meta,
             hidden_states=hidden_states,
+            forward_meta=forward_meta,
         )
 
         return hidden_states, residual
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index 6f518582bd9..ad4acb48bff 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -271,7 +271,7 @@ def load_state_dict(self, state_dict):
 
     def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta, vl_moe_meta: VLMoEMeta):
         if self.num_shared_experts > 0:
-            shared_experts_out = self.shared_experts(hidden_states)
+            shared_experts_out = self.shared_experts(hidden_states, forward_meta)
         hidden_states, text_input, image_input = text_image_gather_scatter(
             hidden_states,
             vl_moe_meta.text_input,

From 5a5b46e442dd08b0ca27fbd6c47c16b7128287b0 Mon Sep 17 00:00:00 2001
From: Wanglongzhi2001 <583087864@qq.com>
Date: Thu, 20 Nov 2025 14:58:56 +0800
Subject: [PATCH 3/5] fix: drop forward_meta from forward_split_allgather and
 quant_method.apply

---
 fastdeploy/model_executor/layers/moe/moe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index 36da0819a14..36a98e54dbe 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -573,7 +573,7 @@ def load_state_dict(self, state_dict, is_rearrange: bool = False):
         else:
             self.quant_method.process_loaded_weights(self, state_dict)
 
-    def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta):
+    def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
         """
         Forward split allgather function.
""" @@ -588,7 +588,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer, forward_meta if end_offset > token_num: end_offset = token_num part_x[: (end_offset - start_offset), :] = x[start_offset:end_offset, :] - out = self.quant_method.apply(self, part_x, gate, forward_meta) + out = self.quant_method.apply(self, part_x, gate) multi_outs = paddle.zeros([token_num_per_rank * self.tp_size, x.shape[1]], dtype=x.dtype) paddle.distributed.all_gather(multi_outs, out, self.tp_group) out = multi_outs[:token_num, :] @@ -612,7 +612,7 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): and (not self.fd_config.parallel_config.use_sequence_parallel_moe) and token_num >= self.tp_size ): - out = self.forward_split_allgather(x, gate, forward_meta) + out = self.forward_split_allgather(x, gate) else: - out = self.quant_method.apply(self, x, gate, forward_meta) + out = self.quant_method.apply(self, x, gate) return out From 10137ec2a28994e1f7cc4b62d004f867b036a2d1 Mon Sep 17 00:00:00 2001 From: Wanglongzhi2001 <583087864@qq.com> Date: Thu, 20 Nov 2025 15:00:47 +0800 Subject: [PATCH 4/5] fix --- .../model_executor/layers/moe/fused_moe_backend_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 4aba9982c4a..03191066713 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -19,7 +19,6 @@ import paddle from paddle import nn -from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.utils import ( TensorTracker, default_weight_loader, @@ -199,14 +198,13 @@ def apply( layer: nn.Layer, x: paddle.Tensor, gate: nn.Layer, - forward_meta: ForwardMeta, ) -> paddle.Tensor: """ Paddle Cutlass compute Fused MoE. 
""" if layer.ep_size > 1: is_moe_start_layer = layer.layer_idx == layer.fd_config.model_config.moe_layer_start_index - if forward_meta.moe_phase.phase == "prefill": + if layer.fd_config.model_config.moe_phase.phase == "prefill": if layer.fd_config.scheduler_config.splitwise_role == "mixed" and is_moe_start_layer: self.ep_prefill_runner.clean_low_latency_buffer() return self.apply_ep_prefill(layer, x, gate) From c2712a623627a4720cd42b37190bc4102b070820 Mon Sep 17 00:00:00 2001 From: Wanglongzhi2001 <583087864@qq.com> Date: Wed, 26 Nov 2025 10:39:19 +0800 Subject: [PATCH 5/5] fix forward_meta --- fastdeploy/model_executor/models/ernie4_5_moe.py | 4 ++-- .../model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py | 6 +++--- fastdeploy/model_executor/models/qwen3moe.py | 4 ++-- fastdeploy/worker/gcu_model_runner.py | 6 +++--- fastdeploy/worker/gpu_model_runner.py | 6 +++--- fastdeploy/worker/hpu_model_runner.py | 4 ++-- fastdeploy/worker/metax_model_runner.py | 6 +++--- fastdeploy/worker/xpu_model_runner.py | 6 +++--- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index aae9998463f..7d26764cb38 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -620,7 +620,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -632,7 +632,7 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate, self.forward_meta) + self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate, forward_meta) def forward( self, diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index ad4acb48bff..7c3685f9b22 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -745,7 +745,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -757,8 +757,8 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states, self.forward_meta) - self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states, self.forward_meta) + self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states, forward_meta) + self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states, forward_meta) def get_input_embeddings( self, diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index c16d83d8331..5f67ba75b99 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -418,7 +418,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -430,7 +430,7 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - 
-            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate, self.forward_meta)
+            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate, forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 3444cc7dd1f..6bd8da02b24 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -971,7 +971,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop():
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 1. Prepare inputs of model and sampler.
@@ -1088,14 +1088,14 @@ class at the server level, which is too granular for ModelRunner.
             self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
        In certain scenarios, such as during EP,
        the runner needs to execute partial modules of the model without input data.
        This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index e547da97df7..404367cc803 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -2087,7 +2087,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop():
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 2. Padding inputs for cuda graph
@@ -2349,14 +2349,14 @@ def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Opti
 
         return pooler_output
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
        """
        In certain scenarios, such as during EP,
        the runner needs to execute partial modules of the model without input data.
        This requires the model to implement the `empty_input_forward` method.
        """
        if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
        else:
            raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py
index b86d680f08f..b7363dfdbf3 100644
--- a/fastdeploy/worker/hpu_model_runner.py
+++ b/fastdeploy/worker/hpu_model_runner.py
@@ -1345,14 +1345,14 @@ class at the server level, which is too granular for ModelRunner.
                 self.prof.step()
 
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
        """
        In certain scenarios, such as during EP,
        the runner needs to execute partial modules of the model without input data.
        This requires the model to implement the `empty_input_forward` method.
""" if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index b346f7be6ab..3038a34fc2b 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -1812,7 +1812,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop(): - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 2. Padding inputs for cuda graph @@ -1998,14 +1998,14 @@ class at the server level, which is too granular for ModelRunner. ) return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 4ab4ee2ff3c..d337225b178 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -1159,7 +1159,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop() and not is_dummy_run: - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 2. Padding inputs for cuda grph @@ -1231,14 +1231,14 @@ class at the server level, which is too granular for ModelRunner. return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")