From 01c059d67ce3c4d497c0a513038c0bb4b4dcf8c7 Mon Sep 17 00:00:00 2001 From: lkk Date: Mon, 1 Dec 2025 14:18:31 +0000 Subject: [PATCH 1/9] add mxfp4 qat, mainly packing code. --- .../torch/algorithms/qat/mxfp4_packing.py | 24 +++++++++++++++++++ .../torch/algorithms/qat/quant_utils.py | 4 ++++ .../torch/algorithms/qat/tensor_quantizer.py | 11 +++++++++ neural_compressor/torch/export/export_hf.py | 8 ++++++- .../torch/quantization/config.py | 4 ++++ 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 neural_compressor/torch/algorithms/qat/mxfp4_packing.py diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py new file mode 100644 index 00000000000..513416a4c0c --- /dev/null +++ b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py @@ -0,0 +1,24 @@ +import torch + +E2M1_max = 6.0 + +E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] +E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) + +def cast_fp4(x): + sign = torch.sign(x) + sign_bit = (2 - sign) // 2 + ord_ = torch.sum( + (x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1 + ) + fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) + return fp4_val + +def fuse_uint4_to_uint8(x): + # If the last dimension is odd, pad with zeros + # If this behavior is not desired, please modify the code accordingly + left_side = x[..., 0::2] # Even indices (0, 2, 4...) + right_side = x[..., 1::2] # Odd indices (1, 3, 5...) + new_data = right_side.clone() << 4 # Put odd indices (higher addresses) in high bits + new_data[..., : left_side.shape[-1]] += left_side # Put even indices in low bits + return new_data diff --git a/neural_compressor/torch/algorithms/qat/quant_utils.py b/neural_compressor/torch/algorithms/qat/quant_utils.py index bf99a36d5b3..e9679823b29 100644 --- a/neural_compressor/torch/algorithms/qat/quant_utils.py +++ b/neural_compressor/torch/algorithms/qat/quant_utils.py @@ -108,6 +108,7 @@ def get_quant_config(scheme: str) -> dict[str, Any]: quantization_config["provider"] = "auto-round" quantization_config["config_groups"]["group_0"]["weights"]["is_mx"] = True quantization_config["config_groups"]["group_0"]["input_activations"]["is_mx"] = True + quantization_config["format"] = "float-quantized" except ImportError: quantization_config = None @@ -133,6 +134,9 @@ def _get_quantization_from_layer(layer): if weight_quantizer.num_bits == 8 and weight_quantizer.data_type == "mx_fp8": return "MXFP8" + if weight_quantizer.num_bits == 4 and weight_quantizer.data_type == "mx_fp4": + return "MXFP4" + # Raise error for unsupported num_bits raise NotImplementedError(f"Unsupported quantizer with num_bits: {weight_quantizer.num_bits}") diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index e8c0badad28..6cb5129b471 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -161,6 +161,17 @@ def weight_pack(self, weight, scale): e8m0_scale = (scale + 127).to(torch.uint8) return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) + if self.data_type == "mx_fp4": + qweight = weight.reshape(-1, self.block_size) \ + / torch.exp2(scale.float()) + + from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 + qweight = cast_fp4(qweight) + qweight_packed = fuse_uint4_to_uint8(qweight) + + e8m0_scale = (scale + 127).to(torch.uint8) + return 
qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape(original_shape[0], -1) + def __repr__(self): if self._disabled or not self._if_quant: return "TensorQuantizer(disabled)" diff --git a/neural_compressor/torch/export/export_hf.py b/neural_compressor/torch/export/export_hf.py index e617ae122a9..2d616f0865e 100644 --- a/neural_compressor/torch/export/export_hf.py +++ b/neural_compressor/torch/export/export_hf.py @@ -41,7 +41,13 @@ def _export_quantized_weight(sub_module: nn.Module, quantization_format: str = N sub_module.register_buffer("weight_scale", e8m0_scale) - setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + if quantization_format == "MXFP8": + setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) + + if quantization_format == "MXFP4": + delattr(sub_module, weight_name) + # name aligned for vllm emulation + sub_module.register_buffer("weight_packed", quantized_weight) def _export_hf_checkpoint(model: nn.Module, scheme: str | None = None) -> tuple[dict[str, Any], dict[str, Any]]: diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index dd1bc132776..65cff33f22f 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -2254,6 +2254,10 @@ def get_config_set_for_tuning(cls, dtype="int8"): torch.nn.Linear: "MXFP8", } +QAT_MODULE_MAPPINGS: dict[Callable, Any] = { + torch.nn.Linear: ["MXFP8", "MXFP4"], +} + def get_default_qat_module_mappings() -> dict[Callable, Any]: """Get default module mapping for quantization aware training.""" From c2c7ceee36fcdcc509dda7f6aed0a39df9addcc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Dec 2025 14:21:06 +0000 Subject: [PATCH 2/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../torch/algorithms/qat/mxfp4_packing.py | 20 ++++++++++++++++--- .../torch/algorithms/qat/tensor_quantizer.py | 8 +++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py index 513416a4c0c..1d3f10a9020 100644 --- a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py +++ b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import torch E2M1_max = 6.0 @@ -5,15 +19,15 @@ E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) + def cast_fp4(x): sign = torch.sign(x) sign_bit = (2 - sign) // 2 - ord_ = torch.sum( - (x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1 - ) + ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1) fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) return fp4_val + def fuse_uint4_to_uint8(x): # If the last dimension is odd, pad with zeros # If this behavior is not desired, please modify the code accordingly diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 6cb5129b471..3405f8ecc45 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -162,15 +162,17 @@ def weight_pack(self, weight, scale): return qweight.reshape(original_shape), e8m0_scale.reshape(original_shape[0], -1) if self.data_type == "mx_fp4": - qweight = weight.reshape(-1, self.block_size) \ - / torch.exp2(scale.float()) + qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 + qweight = cast_fp4(qweight) qweight_packed = fuse_uint4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) - return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape(original_shape[0], -1) + return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape( + original_shape[0], -1 + ) def __repr__(self): if self._disabled or not self._if_quant: From af6ef4efcdbab77ba9a9ee63fd25151dc957f559 Mon Sep 17 00:00:00 2001 From: lkk Date: Tue, 2 Dec 2025 13:41:01 +0000 Subject: [PATCH 3/9] add mxfp4 example and override trainer.save_model for saving. --- .../quantization/llm_qat/main.py | 17 ++-- .../quantization/llm_qat/utils.py | 77 ++++++++++++++++++- 2 files changed, 88 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index b00ce1936f4..e37a46f1451 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -12,7 +12,6 @@ AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, - Trainer, default_data_collator, set_seed, TrainerCallback, @@ -21,6 +20,7 @@ from utils import ( get_metrics_with_perplexity, make_supervised_data_module, + QATTrainer ) logger = logging.getLogger(__name__) @@ -69,7 +69,7 @@ class QuantizationArguments: "Specify the quantization format for PTQ/QAT. 
if specified, PTQ/QAT will be enabled" " with the specified quantization format" ), - "choices": ["MXFP8"], + "choices": ["MXFP8", "MXFP4"], }, ) @@ -124,9 +124,16 @@ def train(): # prepare model for quantization if quant_args.quant_scheme is not None: from neural_compressor.torch.quantization.quantize import prepare_qat + + model.train() # inplace - # default mxfp8 - prepare_qat(model) + if quant_args.quant_scheme == "MXFP8": + # default mxfp8 + prepare_qat(model) + if quant_args.quant_scheme == "MXFP4": + mappings = {torch.nn.Linear: "MXFP4"} + prepare_qat(model, mappings) + logger.info("Finish model preparation for QAT.") @@ -154,7 +161,7 @@ def train(): if training_args.gradient_checkpointing and training_args.gradient_checkpointing_kwargs is None: training_args.gradient_checkpointing_kwargs = {"use_reentrant": True} - trainer = Trainer( + trainer = QATTrainer( model=model, processing_class=tokenizer, args=training_args, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py index 96a0c72a703..9c46874f13e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py @@ -22,7 +22,7 @@ import torch import transformers -from transformers import default_data_collator +from transformers import default_data_collator, Trainer IGNORE_INDEX = -100 @@ -146,3 +146,78 @@ def get_metrics_with_perplexity(metrics): if "eval_loss" in metrics: metrics["perplexity"] = float(torch.exp(torch.tensor(metrics["eval_loss"]))) return metrics + + +def print_rank_0(*args, **kwargs): + """Prints only on the master process.""" + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank(group=None) == 0: + print(*args, **kwargs, flush=True) + else: + print(*args, **kwargs, flush=True) + +class QATTrainer(Trainer): + """A drop-in replacement of HuggingFace's Trainer for ModelOpt. + + This class adds extra utilities for ModelOpt checkpointing and memory reporting. 
+ """ + + def __init__(self, *args, **kwargs): + """Initialize.""" + # enable_huggingface_checkpointing() + super().__init__(*args, **kwargs) + + self._original_dtype = getattr( + getattr(self.model, "config", None), "dtype", None + ) or getattr(getattr(self.model, "config", None), "torch_dtype", None) + + def save_model(self, *args, **kwargs): + """Save the quantized model.""" + if ( + (not self.is_in_train) + and self.is_fsdp_enabled + and self.accelerator.state.fsdp_plugin.state_dict_type != "FULL_STATE_DICT" + ): + print_rank_0("Setting state_dict_type to FULL_STATE_DICT for final checkpoint save.") + original_type = self.accelerator.state.fsdp_plugin.state_dict_type + self.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + outputs = super().save_model(*args, **kwargs) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + self.accelerator.state.fsdp_plugin.set_state_dict_type(original_type) + else: + outputs = super().save_model(*args, **kwargs) + if (not self.is_in_train) and self.args.should_save: + out_dir = args[0] + # FSDP may upcast parameter dtype to float32 during mixed-precision training, + # we convert it back to original dtype by updating `torch-dtype` in `config.json` + self._update_config_json_dtype(out_dir, str(self._original_dtype).split(".")[1]) + return outputs + + def _update_config_json_dtype(self, output_dir: str, dtype_str: str | None) -> None: + """Rewrite /config.json 'dtype' (preferred) or 'torch_dtype' to dtype_str.""" + cfg_path = os.path.join(output_dir, "config.json") + if not os.path.isfile(cfg_path): + print_rank_0(f"[warn] config.json not found under {output_dir}; skip dtype rewrite.") + return + try: + with open(cfg_path, encoding="utf-8") as f: + data = json.load(f) + # Prefer 'dtype', else fall back to 'torch_dtype' + key_to_update = ( + "dtype" if "dtype" in data else ("torch_dtype" if "torch_dtype" in data else None) + ) + if key_to_update is None: + print_rank_0( + "[warn] Neither 'dtype' nor 'torch_dtype' present in config.json; skip dtype rewrite." + ) + return + if data.get(key_to_update) != dtype_str: + data[key_to_update] = dtype_str + with open(cfg_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + print_rank_0(f'Updated config.json: {key_to_update} -> "{dtype_str}"') + except Exception as e: + print_rank_0(f"[warn] Failed to update dtype in config.json: {e}") From e758d17d29bda86f53447b685c55c14b1a1ea28a Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:20:27 +0000 Subject: [PATCH 4/9] refine docs. 
--- .../llm_qat/accelerate_config/fsdp2.yaml | 25 +++++++++++++++++++ .../quantization/llm_qat/main.py | 11 ++------ .../quantization/llm_qat/utils.py | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml new file mode 100644 index 00000000000..3c901d61760 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/accelerate_config/fsdp2.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_activation_checkpointing: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_reshard_after_forward: true + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + fsdp_version: 2 +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: gpu +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index e37a46f1451..4d874cb81e1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -179,15 +179,8 @@ def train(): metrics = get_metrics_with_perplexity(metrics) logger.info(f"Evaluation results: \n{metrics}") - if training_args.do_train and quant_args.quant_scheme is None: - logger.info("Saving the model...") - trainer.save_model(training_args.output_dir) - elif quant_args.quant_scheme is not None: - from neural_compressor.torch.export.export_hf import export_hf2compressored_model - # export quantized model for vllm inference using llm-compressor and compressed_tensor - export_hf2compressored_model(model, training_args.output_dir, quant_args.quant_scheme) - if tokenizer is not None: - tokenizer.save_pretrained(training_args.output_dir) + logger.info("Saving the model...") + trainer.save_model(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py index 9c46874f13e..81ced7b2cff 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/utils.py @@ -17,7 +17,7 @@ import types from contextlib import contextmanager from functools import partial - +import json import datasets import torch From 64cd6b7bcc4e7445afd83944fbe16ae6f3a02898 Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:40:06 +0000 Subject: [PATCH 5/9] refine doc. 
--- .../quantization/llm_qat/README.md | 35 +++---------------- .../quantization/llm_qat/main.py | 2 +- .../llm_qat/quantize_autoround.py | 10 ++++++ 3 files changed, 15 insertions(+), 32 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md index 2e40d5ba14c..19eb1513e92 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/README.md @@ -54,43 +54,16 @@ accelerate launch --config-file accelerate_config/fsdp1.yaml \ ##### Step 2: -Quantize the trained model using `prepare_qat()` by setting the following flags `--quant_scheme MXFP8 --do_train False`. This inserts fake quantization modules into the model without starting training yet. Then save the model directly to a get post training quantization model. +Save the model directly to a get post training quantization model with using [auto-round](https://github.com/intel/auto-round). ``` -accelerate launch --config-file accelerate_config/fsdp1.yaml \ - --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \ - main.py \ - --model_name_or_path ./llama3.1-finetuned \ - --model_max_length 4096 \ - --dataloader_drop_last True \ - --do_train False \ - --do_eval False \ - --quant_scheme MXFP8 \ - --output_dir ./llama3.1-finetuned-ptq \ - --dataset Daring-Anteater \ - --num_train_epochs 2.0 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --eval_accumulation_steps 1 \ - --save_strategy steps \ - --save_steps 3000 \ - --eval_strategy steps \ - --eval_steps 3000 \ - --load_best_model_at_end True \ - --save_total_limit 2 \ - --learning_rate 1e-5 \ - --weight_decay 0.0 \ - --warmup_ratio 0.1 \ - --lr_scheduler_type linear \ - --logging_steps 1 \ - --report_to tensorboard +python quantize_autoround.py ``` ##### Step 3: -Train/fine-tune the quantized model with a small learning rate, e.g. 1e-5 for Adam optimizer by setting `--quant_scheme MXFP8 --do_train True` +Train/fine-tune the quantized model with a small learning rate, e.g. 
1e-5 for Adam optimizer by setting `--quant_scheme MXFP4 --do_train True` ``` accelerate launch --config-file accelerate_config/fsdp1.yaml \ @@ -101,7 +74,7 @@ accelerate launch --config-file accelerate_config/fsdp1.yaml \ --dataloader_drop_last True \ --do_train True \ --do_eval True \ - --quant_scheme MXFP8 \ + --quant_scheme MXFP4 \ --output_dir ./llama3.1-finetuned-qat \ --dataset Daring-Anteater \ --max_steps 1000 \ diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py index 4d874cb81e1..2d8a0a4bd7e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/main.py @@ -47,7 +47,7 @@ class TrainingArguments(transformers.TrainingArguments): class DataArguments: dataset: str = field( default="Daring-Anteater", - metadata={"help": "Specify the dataset.", "choices": ["Daring-Anteater"]}, + metadata={"help": "Specify the dataset.", "choices": ["Daring-Anteater", "cnn_dailymail"]}, ) train_size: int = field( default=0, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py new file mode 100644 index 00000000000..bb4f846df83 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm_qat/quantize_autoround.py @@ -0,0 +1,10 @@ + +from auto_round import AutoRound + +model_name_or_path = "./llama3.1-finetuned" +output_dir = "./Llama-3.1-8B-Instruct_autoround_rtn_mxfp4" + +# Available schemes: "W2A16", "W3A16", "W4A16", "W8A16", "NVFP4", "MXFP4" (no real kernels), "GGUF:Q4_K_M", etc. +ar = AutoRound(model_name_or_path, scheme="MXFP4", iters=0) + +ar.quantize_and_save(output_dir=output_dir, format="llm_compressor") From 8ae016a79b18fcdcec78625e6c5d7a2a26a74c1e Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:44:52 +0000 Subject: [PATCH 6/9] add ut. --- test/3x/torch/algorithms/qat/test_qat.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_qat.py b/test/3x/torch/algorithms/qat/test_qat.py index 83fc1dd4348..f5443d5e13c 100644 --- a/test/3x/torch/algorithms/qat/test_qat.py +++ b/test/3x/torch/algorithms/qat/test_qat.py @@ -65,3 +65,28 @@ def test_train(): for name, param in model.named_parameters(): assert param.grad is not None optimizer.step() + + +def test_train_mxfp4(): + """QAT test.""" + setup_seed(20) + + model = TinyModel() + mappings = {torch.nn.Linear: "MXFP4"} + prepare_qat(model, mappings) + + inp = torch.randn([2, 32]) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(inp) + loss = output.mean() + + optimizer.zero_grad() + loss.backward() + + # check the grad + for name, param in model.named_parameters(): + assert param.grad is not None + optimizer.step() From 9d7b4a2cd4d2c369841e91c12b2138f50f2cda23 Mon Sep 17 00:00:00 2001 From: lkk Date: Wed, 3 Dec 2025 08:55:42 +0000 Subject: [PATCH 7/9] replace fp4 packing func. 
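
The handwritten cast/fuse helpers in mxfp4_packing.py are dropped in favor of
auto-round's packer. A minimal usage sketch of the new call, mirroring how
weight_pack uses it below (it assumes pack_fp4_to_uint8 accepts a float tensor
of block-scaled, E2M1-representable values and returns nibble-packed uint8;
the nibble order is whatever auto-round defines):

    import torch
    from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8

    q = torch.tensor([[0.5, -1.5, 3.0, 6.0]])  # values already on the E2M1 grid
    packed = pack_fp4_to_uint8(q)              # two FP4 values fused per uint8 byte
    # weight_pack then reshapes to (rows, cols // 2) and stores the shared
    # exponents as e8m0 uint8 via (scale + 127).to(torch.uint8)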
--- .../torch/algorithms/qat/mxfp4_packing.py | 38 ------------------- .../torch/algorithms/qat/tensor_quantizer.py | 6 +-- 2 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 neural_compressor/torch/algorithms/qat/mxfp4_packing.py diff --git a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py b/neural_compressor/torch/algorithms/qat/mxfp4_packing.py deleted file mode 100644 index 1d3f10a9020..00000000000 --- a/neural_compressor/torch/algorithms/qat/mxfp4_packing.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -E2M1_max = 6.0 - -E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6] -E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5]) - - -def cast_fp4(x): - sign = torch.sign(x) - sign_bit = (2 - sign) // 2 - ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds.to(x.device)) > 0, dim=-1) - fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8) - return fp4_val - - -def fuse_uint4_to_uint8(x): - # If the last dimension is odd, pad with zeros - # If this behavior is not desired, please modify the code accordingly - left_side = x[..., 0::2] # Even indices (0, 2, 4...) - right_side = x[..., 1::2] # Odd indices (1, 3, 5...) - new_data = right_side.clone() << 4 # Put odd indices (higher addresses) in high bits - new_data[..., : left_side.shape[-1]] += left_side # Put even indices in low bits - return new_data diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index 3405f8ecc45..b270c4878aa 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -164,10 +164,8 @@ def weight_pack(self, weight, scale): if self.data_type == "mx_fp4": qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) - from .mxfp4_packing import cast_fp4, fuse_uint4_to_uint8 - - qweight = cast_fp4(qweight) - qweight_packed = fuse_uint4_to_uint8(qweight) + from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8 + qweight_packed = pack_fp4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) return qweight_packed.reshape(original_shape[0], original_shape[1] // 2), e8m0_scale.reshape( From 12a061e038673c08d0474d621c427806183fcd68 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 08:59:01 +0000 Subject: [PATCH 8/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/torch/algorithms/qat/tensor_quantizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py index b270c4878aa..2146ab80b7a 100644 --- a/neural_compressor/torch/algorithms/qat/tensor_quantizer.py +++ b/neural_compressor/torch/algorithms/qat/tensor_quantizer.py @@ -165,6 +165,7 @@ def 
weight_pack(self, weight, scale): qweight = weight.reshape(-1, self.block_size) / torch.exp2(scale.float()) from auto_round.export.export_to_autoround.qlinear_fp import pack_fp4_to_uint8 + qweight_packed = pack_fp4_to_uint8(qweight) e8m0_scale = (scale + 127).to(torch.uint8) From bda4d9352b001e58aedb6f547cb27d9f71e602f9 Mon Sep 17 00:00:00 2001 From: lkk Date: Fri, 5 Dec 2025 08:57:51 +0000 Subject: [PATCH 9/9] add more ut. --- .../torch/algorithms/qat/test_quant_utils.py | 11 ++++++++++ .../qat/test_quantizer_and_linear.py | 21 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/test/3x/torch/algorithms/qat/test_quant_utils.py b/test/3x/torch/algorithms/qat/test_quant_utils.py index cca51126caf..13ffc89b8c0 100644 --- a/test/3x/torch/algorithms/qat/test_quant_utils.py +++ b/test/3x/torch/algorithms/qat/test_quant_utils.py @@ -206,3 +206,14 @@ def test_get_quantization_format_disabled_returns_none(disabled): assert fmt is None else: assert fmt == "MXFP8" + + layer.weight_quantizer = TensorQuantizer(bits=4, data_type="mx_fp4") + layer.weight_quantizer._disabled = disabled + layer.input_quantizer = TensorQuantizer(bits=4, data_type="mx_fp4") + layer.input_quantizer._disabled = disabled + + fmt = quant_utils.get_quantization_format(layer) + if disabled: + assert fmt is None + else: + assert fmt == "MXFP4" diff --git a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py index 8f5c6108ba8..17097ea7bee 100644 --- a/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py +++ b/test/3x/torch/algorithms/qat/test_quantizer_and_linear.py @@ -163,3 +163,24 @@ def test_tensor_quantizer_scale_persistence(): assert tq.scale.dtype == torch.uint8 # Heuristic: at least one non-zero (if all zero it may still be valid, but improbable) assert (tq.scale != 0).any() or (shared_exp == 0).all() + + +def test_weight_pack(): + # Provide scale_shape so internal buffer is registered & updated + tq = TensorQuantizer(scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + + q_packed, scale = tq.weight_pack(q, shared_exp) + + assert q_packed.dtype == torch.float8_e4m3fn + + tq = TensorQuantizer(data_type="mx_fp4", bits=4, scale_shape=(4, 32), block_size=32) + x = torch.randn(4, 32) + # Use internal fake quant function to generate scale + q, shared_exp = tq._fake_quantize(x) + + q_packed, scale = tq.weight_pack(q, shared_exp) + + assert q_packed.dtype == torch.uint8
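
For reference, a minimal standalone sketch of the E2M1 cast and nibble fusion
that patch 1 introduced in mxfp4_packing.py (later delegated to auto-round's
pack_fp4_to_uint8 in patch 7). It mirrors the removed helpers plus a possible
decode path for checking the round trip; only plain PyTorch is assumed:

    import torch

    # E2M1 (FP4) value grid and the decision boundaries between neighbouring values
    E2M1_values = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6.0])
    E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])

    def cast_fp4(x):
        # sign bit in bit 3, magnitude code (index into E2M1_values) in bits 0-2
        sign_bit = (2 - torch.sign(x)) // 2
        ord_ = torch.sum((x.abs().unsqueeze(-1) - E2M1_bounds) > 0, dim=-1)
        return (sign_bit * 0b1000 + ord_).to(torch.uint8)

    def fuse_uint4_to_uint8(x):
        # even indices go to the low nibble, odd indices to the high nibble
        left, right = x[..., 0::2], x[..., 1::2]
        out = right.clone() << 4
        out[..., : left.shape[-1]] += left
        return out

    def decode_fp4(code):
        # inverse of cast_fp4, only used here to verify the round trip
        mag = E2M1_values[(code & 0b0111).long()]
        return torch.where((code & 0b1000).bool(), -mag, mag)

    w = torch.tensor([[0.5, -0.5, 1.5, 6.0, 3.0, -2.0, 1.0, 4.0]])
    codes = cast_fp4(w)                  # one uint8 code per value
    packed = fuse_uint4_to_uint8(codes)  # two codes per byte -> shape (1, 4)

    low, high = packed & 0x0F, packed >> 4
    restored = torch.stack([decode_fp4(low), decode_fp4(high)], dim=-1).reshape(w.shape)
    assert torch.equal(restored, w)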