From f7aa392e69b31dba6ca6b5a2faae7b527f154a0c Mon Sep 17 00:00:00 2001
From: yangqinghao-cmss
Date: Thu, 6 Nov 2025 17:26:39 +0800
Subject: [PATCH] [bugfix] Fix retrieval of the quantization method for
 mlp.experts (e.g., DeepSeek_v3.2_exp w8a8)

Signed-off-by: yangqinghao-cmss
---
 vllm_ascend/quantization/quant_config.py | 11 +++++++++++
 vllm_ascend/quantization/utils.py        | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index e742852e91..fd53f88efc 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -157,6 +157,17 @@ def is_layer_skipped_ascend(
                         f"Detected some but not all shards of {prefix} "
                         "are quantized. All shards of fused layers "
                         "to have the same precision.")
+        elif "experts" in prefix:
+            # For an experts prefix (e.g., "model.layers.3.mlp.experts"),
+            # assume all experts in the same MLP share one quantization method.
+            experts_quant_description = [
+                self.quant_description[layer]
+                for layer in self.quant_description if prefix in layer
+            ]
+            is_skipped = any(
+                quantization == "FLOAT"
+                for quantization in experts_quant_description
+            )
         else:
             is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
 
diff --git a/vllm_ascend/quantization/utils.py b/vllm_ascend/quantization/utils.py
index 6d914c0dad..749f5b257d 100644
--- a/vllm_ascend/quantization/utils.py
+++ b/vllm_ascend/quantization/utils.py
@@ -52,6 +52,16 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
                 f"Not all shards of {prefix} are quantized with same quant type."
                 f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                 f"use {quant_type}. Please check quantization config.")
+    elif "experts" in prefix:
+        # For an experts prefix (e.g., "model.layers.3.mlp.experts"),
+        # assume all experts in the same MLP share one quantization method.
+        experts_quant_description = set(
+            quant_description[layer]
+            for layer in quant_description if prefix in layer
+        )
+        if len(experts_quant_description) != 1:
+            raise RuntimeError(f"{prefix} has different quantization types: {experts_quant_description}.")
+        quant_type = experts_quant_description.pop()
     else:
         quant_type = quant_description[prefix + '.weight']
     return quant_type
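
As a sanity check, here is a minimal, runnable sketch of the expert-prefix
lookup this patch adds to get_linear_quant_type in utils.py. Everything below
is illustrative: resolve_experts_quant_type is a stand-in helper, and the
quant_description entries and layer names are hypothetical, made up to mimic a
DeepSeek-style w8a8 checkpoint description; the real dict comes from the
model's quantization config.

from typing import Any, Dict

# Hypothetical checkpoint description: the layer names and quant types are
# made up for illustration; real configs key this dict by weight names such
# as "model.layers.3.mlp.experts.0.gate_proj.weight".
quant_description: Dict[str, Any] = {
    "model.layers.3.mlp.experts.0.gate_proj.weight": "W8A8",
    "model.layers.3.mlp.experts.0.up_proj.weight": "W8A8",
    "model.layers.3.mlp.experts.1.gate_proj.weight": "W8A8",
    "model.layers.3.mlp.experts.1.down_proj.weight": "W8A8",
}


def resolve_experts_quant_type(quant_description: Dict[str, Any],
                               prefix: str) -> str:
    # Mirrors the new branch in get_linear_quant_type: collect the quant
    # types of every entry under the experts prefix and require that all
    # experts agree on a single type.
    experts_quant_description = set(
        quant_description[layer]
        for layer in quant_description if prefix in layer)
    if len(experts_quant_description) != 1:
        raise RuntimeError(f"{prefix} has different quantization types: "
                           f"{experts_quant_description}.")
    return experts_quant_description.pop()


print(resolve_experts_quant_type(quant_description,
                                 "model.layers.3.mlp.experts"))  # -> W8A8

Flipping one entry to "FLOAT" makes the set contain two types, so the helper
raises RuntimeError instead of silently picking one, which is the failure mode
the patched check is meant to surface.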