From 01c059d67ce3c4d497c0a513038c0bb4b4dcf8c7 Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 1 Dec 2025 14:18:31 +0000 Subject: [PATCH 1/9] add mxfp4 qat, mainly packing code. --- .../torch/algorithms/qat/mxfp4_packing.py | 24 +++++++++++++++++++ .../torch/algorithms/qat/quant_utils.py | 4 ++++ .../torch/algorithms/qat/tensor_quantizer.py | 11 +++++++++ neural_compressor/torch/export/export_hf.py | 8 ++++++- .../torch/quantization/config.py | 4 ++++ 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 neural_compressor/torch/algorithms/qat/mxfp4_packing.py diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py new file mode 100644 index 00000000000..513416a4c0c --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py @@ -0,0 +1,24 @@ +import torch + +E2M1_max = 6.0 + +E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] +E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) + +def cast_fp4(x): + sign = torch.sign(x) + sign_bit = (2 - sign) // 2 + ord_ = torch.sum( + (x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1 + ) + fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) + return fp4_val + +def fuse_uint4_to_uint8(x): + # If the last dimension is odd, pad with zeros + # If this behavior is not desired, please modify the code accordingly + left_side = x[..., 0::2] # Even indices (0, 2, 4...) + right_side = x[..., 1::2] # Odd indices (1, 3, 5...) + new_data = right_side.clone() << 4 # Put odd indices (higher addresses) in high bits + new_data[..., : left_side.shape[-1]] += left_side # Put even indices in low bits + return new_data diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index bf99a36d5b3..e9679823b29 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -108,6 +108,7 @@ def get_quant_config(scheme: str) -> dict[str, Any]: quantization_config["provider"] = "auto-round" quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] = True quantization_config["config_groups"]["group_0"]["input_activations"]["is_mx"] = True + quantization_config["format"] = "float-quantized" except ImportError: quantization_config = None @@ -133,6 +134,9 @@ def _get_quantization_from_layer(layer): if weight_quantizer.num_bits == 8 and weight_quantizer.data_type == "mx_fp8": return "MXFP8" + if weight_quantizer.num_bits == 4 and weight_quantizer.data_type == "mx_fp4": + return "MXFP4" + # Raise error for unsupported num_bits raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index e8c0badad28..6cb5129b471 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -161,6 +161,17 @@ def weight_pack(self, weight, scale): e8m0_scale = (scale + 127).to(torch.uint8) return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) + if self.data_type == "mx_fp4": + qweight = weight.reshape(-1, self.block_size) \ + / torch.exp2(scale.float()) + + from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 + qweight = cast_fp4(qweight) + qweight_packed = fuse_uint4_to_uint8(qweight) + + e8m0_scale = (scale + 127).to(torch.uint8) + return 
qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape(original_shape[0], -1) + def __repr__(self): if self._disabled or not self._if_quant: return "TensorQuantizer(disabled)" diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index e617ae122a9..2d616f0865e 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -41,7 +41,13 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N sub_module.register_buffer("weight_scale", e8m0_scale) - setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + if quantization_format == "MXFP8": + setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + + if quantization_format == "MXFP4": + delattr(sub_module, weight_name) + # name aligned for vllm emulation + sub_module.register_buffer("weight_packed", quantized_weight) def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[dict[str, Any], dict[str, Any]]: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index dd1bc132776..65cff33f22f 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -2254,6 +2254,10 @@ def get_config_set_for_tuning(cls, dtype="int8"): torch.nn.Linear: "MXFP8", } +QAT_MODULE_MAPPINGS: dict[Callable, Any] = { + torch.nn.Linear: ["MXFP8", "MXFP4"], +} + def get_default_qat_module_mappings() -> dict[Callable, Any]: """Get default module mapping for quantization aware training.""" From c2c7ceee36fcdcc509dda7f6aed0a39df9addcc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Dec 2025 14:21:06 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/qat/mxfp4_packing.py | 20 ++++++++++++++++--- .../torch/algorithms/qat/tensor_quantizer.py | 8 +++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py index 513416a4c0c..1d3f10a9020 100644 --- a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py +++ b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import torch E2M1_max = 6.0 @@ -5,15 +19,15 @@ E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) + def cast_fp4(x): sign = torch.sign(x) sign_bit = (2 - sign) // 2 - ord_ = torch.sum( - (x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1 - ) + ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1) fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) return fp4_val + def fuse_uint4_to_uint8(x): # If the last dimension is odd, pad with zeros # If this behavior is not desired, please modify the code accordingly diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 6cb5129b471..3405f8ecc45 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -162,15 +162,17 @@ def weight_pack(self, weight, scale): return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) if self.data_type == "mx_fp4": - qweight = weight.reshape(-1, self.block_size) \ - / torch.exp2(scale.float()) + qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 + qweight = cast_fp4(qweight) qweight_packed = fuse_uint4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) - return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape(original_shape[0], -1) + return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape( + original_shape[0], -1 + ) def __repr__(self): if self._disabled or not self._if_quant: From af6ef4efcdbab77ba9a9ee63fd25151dc957f559 Mon Sep 17 00:00:00 2001 From: lkk Date: Tue, 2 Dec 2025 13:41:01 +0000 Subject: [PATCH 3/9] add mxfp4 example and override trainer.save_model for saving. --- .../quantization/llm_qat/main.py | 17 ++-- .../quantization/llm_qat/utils.py | 77 ++++++++++++++++++- 2 files changed, 88 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index b00ce1936f4..e37a46f1451 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -12,7 +12,6 @@ AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, - Trainer, default_data_collator, set_seed, TrainerCallback, @@ -21,6 +20,7 @@ from utils import ( get_metrics_with_perplexity, make_supervised_data_module, + QATTrainer ) logger = logging.getLogger(__name__) @@ -69,7 +69,7 @@ class QuantizationArguments: "Specify the quantization format for PTQ/QAT. 
if specified, PTQ/QAT will be enabled" " with the specified quantization format" ), - "choices": ["MXFP8"], + "choices": ["MXFP8", "MXFP4"], }, ) @@ -124,9 +124,16 @@ def train(): # prepare model for quantization if quant_args.quant_scheme is not None: from neural_compressor.torch.quantization.quantize import prepare_qat + + model.train() # inplace - # default mxfp8 - prepare_qat(model) + if quant_args.quant_scheme == "MXFP8": + # default mxfp8 + prepare_qat(model) + if quant_args.quant_scheme == "MXFP4": + mappings = {torch.nn.Linear: "MXFP4"} + prepare_qat(model, mappings) + logger.info("Finish model preparation for QAT.") @@ -154,7 +161,7 @@ def train(): if training_args.gradient_checkpointing and training_args.gradient_checkpointing_kwargs is None: training_args.gradient_checkpointing_kwargs = {"use_reentrant": True} - trainer = Trainer( + trainer = QATTrainer( model=model, processing_class=tokenizer, args=training_args, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py index 96a0c72a703..9c46874f13e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py @@ -22,7 +22,7 @@ import torch import transformers -from transformers import default_data_collator +from transformers import default_data_collator, Trainer IGNORE_INDEX = -100 @@ -146,3 +146,78 @@ def get_metrics_with_perplexity(metrics): if "eval_loss" in metrics: metrics["perplexity"] = float(torch.exp(torch.tensor(metrics["eval_loss"]))) return metrics + + +def print_rank_0(*args, **kwargs): + """Prints only on the master process.""" + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank(group=None) == 0: + print(*args, **kwargs, flush=True) + else: + print(*args, **kwargs, flush=True) + +class QATTrainer(Trainer): + """A drop-in replacement of HuggingFace's Trainer for ModelOpt. + + This class adds extra utilities for ModelOpt checkpointing and memory reporting. 
+ """ + + def __init__(self, *args, **kwargs): + """Initialize.""" + # enable_huggingface_checkpointing() + super().__init__(*args, **kwargs) + + self._original_dtype = getattr( + getattr(self.model, "config", None), "dtype", None + ) or getattr(getattr(self.model, "config", None), "torch_dtype", None) + + def save_model(self, *args, **kwargs): + """Save the quantized model.""" + if ( + (not self.is_in_train) + and self.is_fsdp_enabled + and self.accelerator.state.fsdp_plugin.state_dict_type != "FULL_STATE_DICT" + ): + print_rank_0("Setting state_dict_type to FULL_STATE_DICT for final checkpoint save.") + original_type = self.accelerator.state.fsdp_plugin.state_dict_type + self.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + outputs = super().save_model(*args, **kwargs) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + self.accelerator.state.fsdp_plugin.set_state_dict_type(original_type) + else: + outputs = super().save_model(*args, **kwargs) + if (not self.is_in_train) and self.args.should_save: + out_dir = args[0] + # FSDP may upcast parameter dtype to float32 during mixed-precision training, + # we convert it back to original dtype by updating `torch-dtype` in `config.json` + self._update_config_json_dtype(out_dir, str(self._original_dtype).split(".")[1]) + return outputs + + def _update_config_json_dtype(self, output_dir: str, dtype_str: str | None) -> None: + """Rewrite /config.json 'dtype' (preferred) or 'torch_dtype' to dtype_str.""" + cfg_path = os.path.join(output_dir, "config.json") + if not os.path.isfile(cfg_path): + print_rank_0(f"[warn] config.json not found under {output_dir}; skip dtype rewrite.") + return + try: + with open(cfg_path, encoding="utf-8") as f: + data = json.load(f) + # Prefer 'dtype', else fall back to 'torch_dtype' + key_to_update = ( + "dtype" if "dtype" in data else ("torch_dtype" if "torch_dtype" in data else None) + ) + if key_to_update is None: + print_rank_0( + "[warn] Neither 'dtype' nor 'torch_dtype' present in config.json; skip dtype rewrite." + ) + return + if data.get(key_to_update) != dtype_str: + data[key_to_update] = dtype_str + with open(cfg_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + print_rank_0(f'Updated config.json: {key_to_update} -> "{dtype_str}"') + except Exception as e: + print_rank_0(f"[warn] Failed to update dtype in config.json: {e}") From e758d17d29bda86f53447b685c55c14b1a1ea28a Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:20:27 +0000 Subject: [PATCH 4/9] refine docs. 
--- .../llm_qat/accelerate_config/fsdp2.yaml | 25 +++++++++++++++++++ .../quantization/llm_qat/main.py | 11 ++------ .../quantization/llm_qat/utils.py | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml new file mode 100644 index 00000000000..3c901d61760 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_activation_checkpointing: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_reshard_after_forward: true + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + fsdp_version: 2 +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: gpu +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index e37a46f1451..4d874cb81e1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -179,15 +179,8 @@ def train(): metrics = get_metrics_with_perplexity(metrics) logger.info(f"Evaluation results: \n{metrics}") - if training_args.do_train and quant_args.quant_scheme is None: - logger.info("Saving the model...") - trainer.save_model(training_args.output_dir) - elif quant_args.quant_scheme is not None: - from neural_compressor.torch.export.export_hf import export_hf2compressored_model - # export quantized model for vllm inference using llm-compressor and compressed_tensor - export_hf2compressored_model(model, training_args.output_dir, quant_args.quant_scheme) - if tokenizer is not None: - tokenizer.save_pretrained(training_args.output_dir) + logger.info("Saving the model...") + trainer.save_model(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py index 9c46874f13e..81ced7b2cff 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py @@ -17,7 +17,7 @@ import types from contextlib import contextmanager from functools import partial - +import json import datasets import torch From 64cd6b7bcc4e7445afd83944fbe16ae6f3a02898 Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:40:06 +0000 Subject: [PATCH 5/9] refine doc. 
--- .../quantization/llm_qat/README.md | 35 +++---------------- .../quantization/llm_qat/main.py | 2 +- .../llm_qat/quantize_autoround.py | 10 ++++++ 3 files changed, 15 insertions(+), 32 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md index 2e40d5ba14c..19eb1513e92 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md @@ -54,43 +54,16 @@ accelerate launch --config-file accelerate_config/fsdp1.yaml \ ##### Step 2: -Quantize the trained model using `prepare_qat()` by setting the following flags `--quant_scheme MXFP8 --do_train False`. This inserts fake quantization modules into the model without starting training yet. Then save the model directly to a get post training quantization model. +Save the model directly to a get post training quantization model with using [auto-round](https://github.com/intel/auto-round). ``` -accelerate launch --config-file accelerate_config/fsdp1.yaml \ - --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \ - main.py \ - --model_name_or_path ./llama3.1-finetuned \ - --model_max_length 4096 \ - --dataloader_drop_last True \ - --do_train False \ - --do_eval False \ - --quant_scheme MXFP8 \ - --output_dir ./llama3.1-finetuned-ptq \ - --dataset Daring-Anteater \ - --num_train_epochs 2.0 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --eval_accumulation_steps 1 \ - --save_strategy steps \ - --save_steps 3000 \ - --eval_strategy steps \ - --eval_steps 3000 \ - --load_best_model_at_end True \ - --save_total_limit 2 \ - --learning_rate 1e-5 \ - --weight_decay 0.0 \ - --warmup_ratio 0.1 \ - --lr_scheduler_type linear \ - --logging_steps 1 \ - --report_to tensorboard +python quantize_autoround.py ``` ##### Step 3: -Train/fine-tune the quantized model with a small learning rate, e.g. 1e-5 for Adam optimizer by setting `--quant_scheme MXFP8 --do_train True` +Train/fine-tune the quantized model with a small learning rate, e.g. 
1e-5 for Adam optimizer by setting `--quant_scheme MXFP4 --do_train True` ``` accelerate launch --config-file accelerate_config/fsdp1.yaml \ @@ -101,7 +74,7 @@ accelerate launch --config-file accelerate_config/fsdp1.yaml \ --dataloader_drop_last True \ --do_train True \ --do_eval True \ - --quant_scheme MXFP8 \ + --quant_scheme MXFP4 \ --output_dir ./llama3.1-finetuned-qat \ --dataset Daring-Anteater \ --max_steps 1000 \ diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index 4d874cb81e1..2d8a0a4bd7e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -47,7 +47,7 @@ class TrainingArguments(transformers.TrainingArguments): class DataArguments: dataset: str = field( default="Daring-Anteater", - metadata={"help": "Specify the dataset.", "choices": ["Daring-Anteater"]}, + metadata={"help": "Specify the dataset.", "choices": ["Daring-Anteater", "cnn_dailymail"]}, ) train_size: int = field( default=0, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py new file mode 100644 index 00000000000..bb4f846df83 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py @@ -0,0 +1,10 @@ + +from auto_round import AutoRound + +model_name_or_path = "./llama3.1-finetuned" +output_dir = "./Llama-3.1-8B-Instruct_autoround_rtn_mxfp4" + +# Available schemes: "W2A16", "W3A16", "W4A16", "W8A16", "NVFP4", "MXFP4" (no real kernels), "GGUF:Q4_K_M", etc. +ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=0) + +ar.quantize_and_save(output_dir=output_dir, format="llm_compressor") From 8ae016a79b18fcdcec78625e6c5d7a2a26a74c1e Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:44:52 +0000 Subject: [PATCH 6/9] add ut. --- test/3x/torch/algorithms/qat/test_qat.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 83fc1dd4348..f5443d5e13c 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -65,3 +65,28 @@ def test_train(): for name, param in model.named_parameters(): assert param.grad is not None optimizer.step() + + +def test_train_mxfp4(): + """QAT test.""" + setup_seed(20) + + model = TinyModel() + mappings = {torch.nn.Linear: "MXFP4"} + prepare_qat(model, mappings) + + inp = torch.randn([2, 32]) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(inp) + loss = output.mean() + + optimizer.zero_grad() + loss.backward() + + # check the grad + for name, param in model.named_parameters(): + assert param.grad is not None + optimizer.step() From 9d7b4a2cd4d2c369841e91c12b2138f50f2cda23 Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:55:42 +0000 Subject: [PATCH 7/9] replace fp4 packing func. 
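
The handwritten cast/fuse helpers in mxfp4_packing.py are dropped in favor of
auto-round's packer. A minimal usage sketch of the new call, mirroring how
weight_pack uses it below (it assumes pack_fp4_to_uint8 accepts a float tensor
of block-scaled, E2M1-representable values and returns nibble-packed uint8;
the nibble order is whatever auto-round defines):

    import torch
    from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8

    q = torch.tensor([[0.5, -1.5, 3.0, 6.0]])  # values already on the E2M1 grid
    packed = pack_fp4_to_uint8(q)              # two FP4 values fused per uint8 byte
    # weight_pack then reshapes to (rows, cols // 2) and stores the shared
    # exponents as e8m0 uint8 via (scale + 127).to(torch.uint8)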
--- .../torch/algorithms/qat/mxfp4_packing.py | 38 ------------------- .../torch/algorithms/qat/tensor_quantizer.py | 6 +-- 2 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 neural_compressor/torch/algorithms/qat/mxfp4_packing.py diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py deleted file mode 100644 index 1d3f10a9020..00000000000 --- a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -E2M1_max = 6.0 - -E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] -E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) - - -def cast_fp4(x): - sign = torch.sign(x) - sign_bit = (2 - sign) // 2 - ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1) - fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) - return fp4_val - - -def fuse_uint4_to_uint8(x): - # If the last dimension is odd, pad with zeros - # If this behavior is not desired, please modify the code accordingly - left_side = x[..., 0::2] # Even indices (0, 2, 4...) - right_side = x[..., 1::2] # Odd indices (1, 3, 5...) - new_data = right_side.clone() << 4 # Put odd indices (higher addresses) in high bits - new_data[..., : left_side.shape[-1]] += left_side # Put even indices in low bits - return new_data diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 3405f8ecc45..b270c4878aa 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -164,10 +164,8 @@ def weight_pack(self, weight, scale): if self.data_type == "mx_fp4": qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) - from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 - - qweight = cast_fp4(qweight) - qweight_packed = fuse_uint4_to_uint8(qweight) + from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8 + qweight_packed = pack_fp4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape( From 12a061e038673c08d0474d621c427806183fcd68 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 08:59:01 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/tensor_quantizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index b270c4878aa..2146ab80b7a 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -165,6 +165,7 @@ def 
weight_pack(self, weight, scale): qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8 + qweight_packed = pack_fp4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) From bda4d9352b001e58aedb6f547cb27d9f71e602f9 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 5 Dec 2025 08:57:51 +0000 Subject: [PATCH 9/9] add more ut. --- .../torch/algorithms/qat/test_quant_utils.py | 11 ++++++++++ .../qat/test_quantizer_and_linear.py | 21 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_quant_utils.py b/test/3x/torch/algorithms/qat/test_quant_utils.py index cca51126caf..13ffc89b8c0 100644 --- a/test/3x/torch/algorithms/qat/test_quant_utils.py +++ b/test/3x/torch/algorithms/qat/test_quant_utils.py @@ -206,3 +206,14 @@ def test_get_quantization_format_disabled_returns_none(disabled): assert fmt is None else: assert fmt == "MXFP8" + + layer.weight_quantizer = TensorQuantizer(bits=4, data_type="mx_fp4") + layer.weight_quantizer._disabled = disabled + layer.input_quantizer = TensorQuantizer(bits=4, data_type="mx_fp4") + layer.input_quantizer._disabled = disabled + + fmt = quant_utils.get_quantization_format(layer) + if disabled: + assert fmt is None + else: + assert fmt == "MXFP4" diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py index 8f5c6108ba8..17097ea7bee 100644 --- a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -163,3 +163,24 @@ def test_tensor_quantizer_scale_persistence(): assert tq.scale.dtype == torch.uint8 # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) assert (tq.scale != 0).any() or (shared_exp == 0).all() + + +def test_weight_pack(): + # Provide scale_shape so internal buffer is registered & updated + tq = TensorQuantizer(scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + + q_packed, scale = tq.weight_pack(q, shared_exp) + + assert q_packed.dtype == torch.float8_e4m3fn + + tq = TensorQuantizer(data_type="mx_fp4", bits=4, scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + + q_packed, scale = tq.weight_pack(q, shared_exp) + + assert q_packed.dtype == torch.uint8
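
For reference, a minimal standalone sketch of the E2M1 cast and nibble fusion
that patch 1 introduced in mxfp4_packing.py (later delegated to auto-round's
pack_fp4_to_uint8 in patch 7). It mirrors the removed helpers plus a possible
decode path for checking the round trip; only plain PyTorch is assumed:

    import torch

    # E2M1 (FP4) value grid and the decision boundaries between neighbouring values
    E2M1_values = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6.0])
    E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])

    def cast_fp4(x):
        # sign bit in bit 3, magnitude code (index into E2M1_values) in bits 0-2
        sign_bit = (2 - torch.sign(x)) // 2
        ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds) > 0, dim=-1)
        return (sign_bit * 0b1000 + ord_).to(torch.uint8)

    def fuse_uint4_to_uint8(x):
        # even indices go to the low nibble, odd indices to the high nibble
        left, right = x[..., 0::2], x[..., 1::2]
        out = right.clone() << 4
        out[..., : left.shape[-1]] += left
        return out

    def decode_fp4(code):
        # inverse of cast_fp4, only used here to verify the round trip
        mag = E2M1_values[(code & 0b0111).long()]
        return torch.where((code & 0b1000).bool(), -mag, mag)

    w = torch.tensor([[0.5, -0.5, 1.5, 6.0, 3.0, -2.0, 1.0, 4.0]])
    codes = cast_fp4(w)                  # one uint8 code per value
    packed = fuse_uint4_to_uint8(codes)  # two codes per byte -> shape (1, 4)

    low, high = packed & 0x0F, packed >> 4
    restored = torch.stack([decode_fp4(low), decode_fp4(high)], dim=-1).reshape(w.shape)
    assert torch.equal(restored, w)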