Cortex_M backend: Fuse clamp + hardswish decomposition (pytorch#16016)

AdrianLundell · web-flow · commit fe1bc8a31d28 · 2025-12-02T14:15:09.000+01:00
Adds quantization and fusion of clamp.

This is in turn used to decompose the hardswish operator in two passes,
one clamping the dynamic range before quantization, and one decomposing
the remainder of the operation into a minimum and a mul op.

The tests in this patch expose an issue in the runtime dim_order check:
it cannot distinguish between channels_last and channels_first for tensors
with C=1 or H=W=1. Therefore the check is removed and replaced with a TBD
note.

Additionally fixes per_tensor quantization for conv2d.

---------

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -69,11 +69,7 @@ inline void validate_cmsis_nn_tensor_requirements(
         "Output must have the same sizes as inputs");
   }
 
-  // Dim order consistency
-  ET_CHECK_MSG(
-      executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
-      "Tensors must have same dimension order");
-
+  // TBD (#16032): Validate dim_order
   // TBD: Validate memory alignment (CMSIS-NN requirement)
 }
 
diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py
@@ -4,7 +4,9 @@
 # LICENSE file in the root directory of this source tree.
 
 from .activation_fusion_pass import ActivationFusionPass  # noqa
+from .clamp_hardswish_pass import ClampHardswishPass  # noqa
 from .convert_to_cortex_m_pass import ConvertToCortexMPass  # noqa
+from .decompose_hardswish_pass import DecomposeHardswishPass  # noqa
 from .quantized_op_fusion_pass import QuantizedOpFusionPass  # noqa
 from .replace_quant_nodes_pass import ReplaceQuantNodesPass  # noqa
 from .cortex_m_pass_manager import CortexMPassManager  # noqa  # usort: skip
diff --git a/backends/cortex_m/passes/activation_fusion_pass.py b/backends/cortex_m/passes/activation_fusion_pass.py
@@ -8,6 +8,7 @@
 
 import executorch.backends.cortex_m.ops.operators  # noqa: F401
 from executorch.backends.arm._passes.quant_args import QuantArgs
+from executorch.backends.cortex_m.passes.passes_utils import quantize_val
 
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -33,16 +34,14 @@ class ActivationFusionPass(ExportPass):
         exir_ops.edge.aten.relu.default,
         exir_ops.edge.aten.hardtanh.default,
         exir_ops.edge.aten.hardsigmoid.default,
+        exir_ops.edge.aten.clamp.default,
     }
 
     FUSE_OPS = {
         exir_ops.edge.aten.linear.default,
         exir_ops.edge.aten.convolution.default,
     }
 
-    def _quantize(self, val, scale, zp, qmin, qmax):
-        return min(max(round(val / scale + zp), qmin), qmax)
-
     def _get_validated_qparams(self, node, input_node):
 
         if "input_qparams" not in input_node.meta or "output_qparams" not in node.meta:
@@ -65,14 +64,26 @@ def _get_validated_qparams(self, node, input_node):
 
         match node.target:
             case exir_ops.edge.aten.relu.default:
-                quantized_min_val = self._quantize(0, scale, zp, qmin, qmax)
+                quantized_min_val = quantize_val(0, scale, zp, qmin, qmax)
                 quantized_max_val = qmax
             case exir_ops.edge.aten.hardtanh.default:
-                quantized_min_val = self._quantize(node.args[1], scale, zp, qmin, qmax)
-                quantized_max_val = self._quantize(node.args[2], scale, zp, qmin, qmax)
+                quantized_min_val = quantize_val(node.args[1], scale, zp, qmin, qmax)
+                quantized_max_val = quantize_val(node.args[2], scale, zp, qmin, qmax)
             case exir_ops.edge.aten.hardsigmoid.default:
-                quantized_min_val = self._quantize(0, scale, zp, qmin, qmax)
-                quantized_max_val = self._quantize(1, scale, zp, qmin, qmax)
+                quantized_min_val = quantize_val(0, scale, zp, qmin, qmax)
+                quantized_max_val = quantize_val(1, scale, zp, qmin, qmax)
+            case exir_ops.edge.aten.clamp.default:
+                quantized_min_val = (
+                    quantize_val(node.args[1], scale, zp, qmin, qmax)
+                    if node.args[1] is not None
+                    else qmin
+                )
+                # Last arg is removed if none, so check length of args here
+                quantized_max_val = (
+                    quantize_val(node.args[2], scale, zp, qmin, qmax)
+                    if len(node.args) == 3
+                    else qmax
+                )
             case _:
                 raise RuntimeError("Unexpected target {node.target}.")
 
diff --git a/backends/cortex_m/passes/clamp_hardswish_pass.py b/backends/cortex_m/passes/clamp_hardswish_pass.py
@@ -0,0 +1,37 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
+from torch.fx.node import Argument
+
+
+class ClampHardswishPass(ExportPass):
+    """
+    Adds a clamp operation before hardswish to ensure input is in the range [-3, inf).
+
+    By doing this before quantization the output range of the preceeding op is minimized,
+    potentially improving accuracy.
+    """
+
+    def call_operator(
+        self,
+        op: EdgeOpOverload,
+        args: tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op == torch.ops.aten.hardswish.default:
+            clamped_args = (args[0], -3)
+            clamped_input = super().call_operator(
+                torch.ops.aten.clamp.default, clamped_args, {}, meta
+            )
+            args = (clamped_input,)
+
+        return super().call_operator(op, args, kwargs, meta)
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -10,6 +10,7 @@
 
 import torch
 import torch.fx
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
 
 from executorch.backends.transforms.utils import (
@@ -137,7 +138,8 @@ def _get_convolution_replacement(self, node) -> int:
         input_zero_point = node.meta["input_qparams"][0].zp
         weight_scales = node.meta["input_qparams"][1].scale
         if not isinstance(weight_scales, list):
-            weight_scales = [weight_scales] * weight.data.shape[0]
+            weight_tensor = get_first_fake_tensor(weight)
+            weight_scales = [weight_scales] * weight_tensor.shape[0]
 
         output_qparams = node.meta["output_qparams"][0]
         output_scale = output_qparams.scale
diff --git a/backends/cortex_m/passes/cortex_m_pass_manager.py b/backends/cortex_m/passes/cortex_m_pass_manager.py
@@ -12,7 +12,9 @@
 )
 from executorch.backends.cortex_m.passes import (
     ActivationFusionPass,
+    ClampHardswishPass,
     ConvertToCortexMPass,
+    DecomposeHardswishPass,
     QuantizedOpFusionPass,
     ReplaceQuantNodesPass,
 )
@@ -31,14 +33,16 @@ class CortexMPassManager(PassManager):
         FoldAndAnnotateQParamsPass,
         ReplaceScalarWithTensorArgPass,
         ReplaceQuantNodesPass,
-        QuantizedOpFusionPass,
         ActivationFusionPass,
+        DecomposeHardswishPass,
+        QuantizedOpFusionPass,
         ConvertToCortexMPass,
     ]
 
     pass_list_transform_for_annotation: list[ExportPass] = [
         ScalarsToAttributePass,
         ReplaceScalarWithTensorArgPass,
+        ClampHardswishPass,
     ]
 
     def __init__(self, exported_program, passes=None):
diff --git a/backends/cortex_m/passes/decompose_hardswish_pass.py b/backends/cortex_m/passes/decompose_hardswish_pass.py
@@ -0,0 +1,127 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+
+import executorch.backends.cortex_m.ops.operators  # noqa: F401
+
+import torch
+from executorch.backends.arm._passes.quant_args import QuantArgs
+
+from executorch.backends.cortex_m.passes.passes_utils import quantize_val
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+from torch.fx import GraphModule, Node
+from torch.fx.passes.infra.pass_manager import PassResult
+
+logger = logging.getLogger(__name__)
+
+
+class DecomposeHardswishPass(ExportPass):
+    """
+    Decomposes hardswish like
+
+        hardswish(x) = x * (clamp(x, -3, 3) + 3)/6
+
+    where the add and division is implemented by modifying the quantization parameters similar
+    to hardsigmoid in the activation_fusion_pass. Note that this pass assumes
+    that the output range of the preceding op is already clamped to [-3, inf] during
+    quantization by the clamp_hardswish_pass, removing the need for the negative clamp.
+    """
+
+    TARGETS = {
+        exir_ops.edge.aten.hardswish.default,
+    }
+
+    FUSE_OPS = {
+        exir_ops.edge.aten.linear.default,
+        exir_ops.edge.aten.convolution.default,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        modified = False
+        nodes_to_erase: list[Node] = []
+
+        for node in list(graph_module.graph.nodes):
+            if node.op != "call_function" or node.target not in self.TARGETS:
+                continue
+
+            input_node = node.args[0]
+            if (
+                input_node.op != "call_function"
+                or input_node.target not in self.FUSE_OPS
+            ):
+                logger.warning(
+                    f"Cannot fuse activation {node.name} as input node {input_node.name} is not a supported fused activation op."
+                )
+                continue
+            if len(input_node.users.values()) > 1:
+                logger.warning(
+                    f"Cannot fuse activation {node.name} as input node {input_node.name} has multiple users."
+                )
+                continue
+
+            input_quant_dict = input_node.meta.get("output_qparams", [None])[
+                0
+            ]._asdict()
+            scale = input_quant_dict["scale"]
+            zero_point = input_quant_dict["zp"]
+            qmin = input_quant_dict["qmin"]
+            qmax = input_quant_dict["qmax"]
+
+            # Create min node
+            with graph_module.graph.inserting_after(input_node):
+                clamp_node = graph_module.graph.create_node(
+                    "call_function",
+                    target=exir_ops.edge.aten.minimum.default,
+                    args=(
+                        input_node,
+                        torch.tensor(
+                            quantize_val(3, scale, zero_point, qmin, qmax),
+                            dtype=torch.int8,
+                        ),
+                    ),
+                    kwargs={},
+                )
+                clamp_node.meta = input_node.meta.copy()
+
+            # Create mul node
+            with graph_module.graph.inserting_after(clamp_node):
+                mul_node = graph_module.graph.create_node(
+                    "call_function",
+                    target=exir_ops.edge.aten.mul.Tensor,
+                    args=(input_node, clamp_node),
+                    kwargs={},
+                )
+                mul_node.meta = node.meta.copy()
+
+            mul_quant_dict = node.meta["input_qparams"][0]._asdict()
+
+            mul_quant_dict_shifted = mul_quant_dict.copy()
+            mul_quant_dict_shifted["zp"] = mul_quant_dict_shifted["zp"] - round(
+                3 / (mul_quant_dict_shifted["scale"])
+            )
+
+            output_quant_dict = node.meta["output_qparams"][0]._asdict()
+            output_quant_dict["scale"] = output_quant_dict["scale"] * 6
+
+            node.meta["input_qparams"][0] = QuantArgs(**mul_quant_dict)
+            mul_node.meta["input_qparams"][1] = QuantArgs(**mul_quant_dict_shifted)
+            mul_node.meta["output_qparams"][0] = QuantArgs(**output_quant_dict)
+
+            node.replace_all_uses_with(mul_node)
+            nodes_to_erase.append(node)
+            modified = True
+
+        for node in nodes_to_erase:
+            graph_module.graph.erase_node(node)
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+
+        return PassResult(graph_module, modified)
diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py
@@ -17,6 +17,10 @@
 SHIFT_INT8 = 20
 
 
+def quantize_val(val, scale, zp, qmin, qmax):
+    return min(max(round(val / scale + zp), qmin), qmax)
+
+
 def dequantize_per_tensor_cmsis(
     qtensor: torch.Tensor, zero_point: int, multiplier: int, shift: int
 ) -> torch.Tensor:
diff --git a/backends/cortex_m/quantizer/operator_configs.py b/backends/cortex_m/quantizer/operator_configs.py
@@ -19,6 +19,8 @@
 BINARY_OP_PATTERNS = [
     [torch.ops.aten.add.Tensor],
     [torch.ops.aten.mul.Tensor],
+    [torch.ops.aten.hardswish.default],
+    [torch.ops.aten.hardswish_.default],
 ]
 
 LINEAR_OP_PATTERNS = [
@@ -29,6 +31,8 @@
     [torch.ops.aten.linear.default, torch.ops.aten.hardtanh_.default],
     [torch.ops.aten.linear.default, torch.ops.aten.hardsigmoid.default],
     [torch.ops.aten.linear.default, torch.ops.aten.hardsigmoid_.default],
+    [torch.ops.aten.linear.default, torch.ops.aten.clamp.default],
+    [torch.ops.aten.linear.default, torch.ops.aten.clamp_.default],
 ]
 
 CONV_OP_PATTERNS = [
@@ -39,6 +43,8 @@
     [torch.ops.aten.conv2d.default, torch.ops.aten.hardtanh_.default],
     [torch.ops.aten.conv2d.default, torch.ops.aten.hardsigmoid.default],
     [torch.ops.aten.conv2d.default, torch.ops.aten.hardsigmoid_.default],
+    [torch.ops.aten.conv2d.default, torch.ops.aten.clamp.default],
+    [torch.ops.aten.conv2d.default, torch.ops.aten.clamp_.default],
 ]
 
 # ----------------- OPERATOR CONFIG PRESETS -----------------
diff --git a/backends/cortex_m/test/ops/test_activation.py b/backends/cortex_m/test/ops/test_activation.py
diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py

Original file line number	Diff line number	Diff line change
`@@ -69,11 +69,7 @@ inline void validate_cmsis_nn_tensor_requirements(`
`69`	`69`	`"Output must have the same sizes as inputs");`
`70`	`70`	`}`
`71`	`71`
`72`		`- // Dim order consistency`
`73`		`- ET_CHECK_MSG(`
`74`		`- executorch::runtime::tensors_have_same_dim_order(input1, input2, output),`
`75`		`- "Tensors must have same dimension order");`
`76`		`-`
	`72`	`+ // TBD (#16032): Validate dim_order`
`77`	`73`	`// TBD: Validate memory alignment (CMSIS-NN requirement)`
`78`	`74`	`}`
`79`	`75`