diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 83af4ebdd50..62fb0139e3e 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -180,6 +180,7 @@ def __init__(
     ):
         self.model = ""
         self.is_quantized = False
+        self.is_moe_quantized = False
         self.max_model_len = 0
         self.dtype = "bfloat16"
         self.enable_logprob = False
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index ede87972185..fcce5f0c3d2 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -564,7 +564,7 @@ def load_state_dict(self, state_dict, is_rearrange: bool = False):
         """
         load_state_dict function.
         """
-        if self.is_quantized:
+        if self.is_quantized or self.fd_config.model_config.is_moe_quantized:
             if getattr(self.fd_config.quant_config, "is_permuted", True):
                 self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
             else:
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
index f8716369852..5d882aed292 100644
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -51,8 +51,14 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if quantization_config is not None:
         if "is_quantized" in quantization_config:
             model_config.is_quantized = quantization_config["is_quantized"]
+        elif "is_moe_quantized" in quantization_config:
+            model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
         elif "kv_cache_quant_type" not in quantization_config:
             model_config.is_quantized = True
+            if "is_moe_quantized" not in quantization_config:
+                model_config.is_quantized = True
+            else:
+                model_config.is_moe_quantized = True
     if quantization_config is not None and quantization_config.get("quantization", None) is None:
         raise ValueError(
             "quantization_config should have a key named 'quantization' for specify quant config."
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 75947590be8..051337747c8 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -138,7 +138,8 @@ def __init__(
                 "down_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.down_proj.code_zp",
             }
         elif moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_weight_key": f"{prefix}.gate.weight",
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index a291db0e9a5..e570748e908 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -105,7 +105,8 @@ def __init__(
         moe_quant_type = fd_config.quant_config.moe_quant_type
 
         if moe_quant_type == "tensor_wise_fp8" or (
-            moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized
+            moe_quant_type == "block_wise_fp8"
+            and (fd_config.model_config.is_quantized or fd_config.model_config.is_moe_quantized)
         ):
             weight_key_map = {
                 "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias",
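
For reference, a minimal standalone sketch of the flag handling that parse_quant_config gains in this diff. ModelConfig is stubbed down to the two flags involved and parse_flags is a hypothetical name; only the branch logic is copied from the hunk above.

class ModelConfig:
    # Stub of fastdeploy.config.ModelConfig, reduced to the two flags
    # touched by this diff; the real class carries many more fields.
    def __init__(self):
        self.is_quantized = False
        self.is_moe_quantized = False  # new flag added in fastdeploy/config.py


def parse_flags(quantization_config, model_config):
    # Mirror of the updated branch in parse_quant_config. An explicit
    # "is_quantized" or "is_moe_quantized" entry is honored directly;
    # otherwise a config without "kv_cache_quant_type" is treated as a
    # fully pre-quantized checkpoint. (Note that the second elif already
    # consumes configs containing "is_moe_quantized", so the inner else
    # below cannot fire in this chain.)
    if quantization_config is None:
        return
    if "is_quantized" in quantization_config:
        model_config.is_quantized = quantization_config["is_quantized"]
    elif "is_moe_quantized" in quantization_config:
        model_config.is_moe_quantized = quantization_config["is_moe_quantized"]
    elif "kv_cache_quant_type" not in quantization_config:
        model_config.is_quantized = True
        if "is_moe_quantized" not in quantization_config:
            model_config.is_quantized = True
        else:
            model_config.is_moe_quantized = True


# An MoE-only pre-quantized checkpoint sets the new flag without
# flipping the dense-weights flag:
cfg = ModelConfig()
parse_flags({"quantization": "block_wise_fp8", "is_moe_quantized": True}, cfg)
assert cfg.is_moe_quantized and not cfg.is_quantized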
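
The remaining hunks all widen the same gate from fd_config.model_config.is_quantized to "is_quantized or is_moe_quantized": once in load_state_dict in moe.py, and once per model file when selecting the pre-quantized weight_key_map. The helper below is a hypothetical distillation of that condition, not a function in the codebase; it only restates the boolean from the diff.

def uses_prequant_weight_keys(moe_quant_type: str,
                              is_quantized: bool,
                              is_moe_quantized: bool) -> bool:
    # Same expression as in ernie4_5_moe.py and ernie4_5_vl_moe.py after
    # this change: tensor_wise_fp8 always selects the pre-quantized key
    # map, and block_wise_fp8 now does so when either flag is set.
    return moe_quant_type == "tensor_wise_fp8" or (
        moe_quant_type == "block_wise_fp8"
        and (is_quantized or is_moe_quantized)
    )


# Before this diff, a block_wise_fp8 checkpoint whose MoE weights alone
# were pre-quantized (is_quantized == False) fell through to the
# unquantized key map and the plain weight-processing path:
assert uses_prequant_weight_keys("block_wise_fp8", False, True)
assert not uses_prequant_weight_keys("block_wise_fp8", False, False)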