Commit 270f3e5

hsharma35 authored and meta-codesync[bot] committed
Migrate generic quantized cadence operators to oss. (#16025)

Summary:
Pull Request resolved: #16025

Migrates operators from internal repo to cadence/generic.

Reviewed By: DrJessop

Differential Revision: D87900844
1 parent ee236cb · commit 270f3e5


41 files changed (+4400, -2358 lines)

backends/cadence/aot/TARGETS

Lines changed: 11 additions & 11 deletions

@@ -146,17 +146,17 @@ executorch_generated_lib(
     deps = [
         "//executorch/backends/cadence/generic/kernels:cadence_kernels",
         "//executorch/backends/cadence/generic/operators:op_requantize",
-        "//executorch/backends/cadence/generic/operators:im2row_out",
-        "//executorch/backends/cadence/generic/operators:dequantize_per_tensor",
-        "//executorch/backends/cadence/generic/operators:quantize_per_tensor",
-        "//executorch/backends/cadence/generic/operators:quantized_add_out",
-        "//executorch/backends/cadence/generic/operators:quantized_conv2d_nchw_out",
-        "//executorch/backends/cadence/generic/operators:quantized_conv2d_nhwc_out",
-        "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out",
-        "//executorch/backends/cadence/generic/operators:quantized_layer_norm",
-        "//executorch/backends/cadence/generic/operators:quantized_linear_out",
-        "//executorch/backends/cadence/generic/operators:quantized_matmul_out",
-        "//executorch/backends/cadence/generic/operators:quantized_relu_out",
+        "//executorch/backends/cadence/generic/operators:op_im2row",
+        "//executorch/backends/cadence/generic/operators:op_dequantize_per_tensor",
+        "//executorch/backends/cadence/generic/operators:op_quantize_per_tensor",
+        "//executorch/backends/cadence/generic/operators:op_quantized_add",
+        "//executorch/backends/cadence/generic/operators:op_quantized_conv2d",
+        "//executorch/backends/cadence/generic/operators:op_quantized_conv1d",
+        "//executorch/backends/cadence/generic/operators:op_quantized_fully_connected",
+        "//executorch/backends/cadence/generic/operators:op_quantized_layer_norm",
+        "//executorch/backends/cadence/generic/operators:op_quantized_linear",
+        "//executorch/backends/cadence/generic/operators:op_quantized_matmul",
+        "//executorch/backends/cadence/generic/operators:op_quantized_relu",
         "//executorch/kernels/portable:executorch_all_ops",
         "//executorch/kernels/portable:operators",
     ],

backends/cadence/aot/functions.yaml

Lines changed: 2 additions & 2 deletions

@@ -349,12 +349,12 @@
     - arg_meta: null
       kernel_name: impl::generic::im2row_per_tensor_out

-- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: impl::generic::quantized_conv2d_nchw_per_tensor_out

-- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
      kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out
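
A note on the schema: out_multiplier and out_shift typically carry the output requantization scale in fixed point, so the kernel can avoid floating-point math on DSPs. The sketch below shows one common convention (a Q31 multiplier plus a power-of-two shift); the helper names and the exact convention are assumptions for illustration, not taken from this commit:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch: represent a float scale as a Q31 fixed-point multiplier and a
// power-of-two shift, one common convention behind out_multiplier/out_shift.
void encode_scale(float scale, int32_t& multiplier, int32_t& shift) {
  int exp = 0;
  float m = std::frexp(scale, &exp);  // scale = m * 2^exp, m in [0.5, 1)
  multiplier = static_cast<int32_t>(std::lround(m * 2147483648.0));  // m * 2^31
  shift = exp;  // demo only: assumes shift < 31, no saturation handling
}

// acc * scale ~= (acc * multiplier) * 2^(shift - 31), with round-to-nearest.
int32_t requantize(int32_t acc, int32_t multiplier, int32_t shift) {
  const int64_t prod = static_cast<int64_t>(acc) * multiplier;
  const int total_shift = 31 - shift;
  const int64_t rounding = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>((prod + rounding) >> total_shift);
}

int main() {
  int32_t mult = 0, shift = 0;
  encode_scale(1.0f / 256.0f, mult, shift);
  std::printf("%d\n", static_cast<int>(requantize(2560, mult, shift)));  // prints 10
}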

backends/cadence/generic/kernels/kernels.cpp

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@
  */

 #include <executorch/backends/cadence/generic/kernels/kernels.h>
+
 #include <algorithm>
 #include <cmath>
 #include <cstring>

backends/cadence/generic/kernels/kernels.h

Lines changed: 3 additions & 2 deletions

@@ -6,8 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include "inttypes.h"
-#include "stddef.h"
+#include <stddef.h>
+
+#include <cstdint>

 namespace impl {
 namespace generic {
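
For context, the quantize/dequantize templates declared in this header back the operator implementations below. Their real definitions live in kernels.cpp (not shown in full here); the following is only a minimal affine-quantization sketch, consistent with the call sites in op_quantized_add.cpp where quantize receives the inverse of the output scale:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

namespace sketch {

// Dequantize: recover a float from a quantized value (assumed semantics).
template <typename T>
float dequantize(T q, float scale, int32_t zero_point) {
  return scale * static_cast<float>(static_cast<int32_t>(q) - zero_point);
}

// Quantize: callers pass the *inverse* scale (1.0f / out_scale), matching
// the quantize<T>(z, inv_out_scale, out_zero_point) call sites below.
template <typename T>
T quantize(float x, float inv_scale, int32_t zero_point) {
  float v = std::round(x * inv_scale) + static_cast<float>(zero_point);
  v = std::max(v, static_cast<float>(std::numeric_limits<T>::min()));
  v = std::min(v, static_cast<float>(std::numeric_limits<T>::max()));
  return static_cast<T>(v);
}

} // namespace sketch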

backends/cadence/generic/operators/TARGETS

Lines changed: 0 additions & 5 deletions
This file was deleted.

backends/cadence/generic/operators/cadence_type_util.h (new file)

Lines changed: 63 additions & 0 deletions

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

/**
 * @file cadence_type_util.h
 * @brief Common type macros for Cadence quantized operators
 *
 * This header provides utility macros for iterating over supported quantized
 * data types in Cadence operators. These macros are used with switch
 * statements to dispatch to type-specific implementations.
 */

/**
 * Macro to iterate over standard Cadence quantized types (uint8_t, int8_t)
 *
 * Usage:
 *   ET_FORALL_CADENCE_QUANTIZED_TYPES(MACRO)
 *
 * Where MACRO is defined as: #define MACRO(ctype, name) ...
 *   - ctype: C++ type (uint8_t or int8_t)
 *   - name: ExecuTorch ScalarType name suffix (Byte or Char)
 *
 * Example:
 *   #define HANDLE_TYPE(ctype, name) \
 *     case ScalarType::name:         \
 *       return process<ctype>(tensor);
 *
 *   ScalarType dtype = tensor.scalar_type();
 *   switch (dtype) {
 *     ET_FORALL_CADENCE_QUANTIZED_TYPES(HANDLE_TYPE)
 *     default:
 *       ET_CHECK_MSG(false, "Unsupported dtype");
 *   }
 */
#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
  _(uint8_t, Byte)                           \
  _(int8_t, Char)

/**
 * Macro to iterate over extended Cadence quantized types including int16_t
 *
 * Usage:
 *   ET_FORALL_CADENCE_QUANTIZED_TYPES_WITH_INT16(MACRO)
 *
 * Where MACRO is defined as: #define MACRO(ctype, name) ...
 *   - ctype: C++ type (uint8_t, int8_t, or int16_t)
 *   - name: ExecuTorch ScalarType name suffix (Byte, Char, or Short)
 *
 * This macro includes int16_t support for operators that can handle 16-bit
 * quantized values (e.g., quantized_linear, quantized_fully_connected).
 */
#define ET_FORALL_CADENCE_QUANTIZED_TYPES_WITH_INT16(_) \
  _(uint8_t, Byte)                                      \
  _(int8_t, Char)                                       \
  _(int16_t, Short)
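
As a usage illustration (not part of the commit), here is a minimal, self-contained sketch of the X-macro dispatch pattern this header enables; the ScalarType enum and process function below are stand-ins for the real ExecuTorch types:

#include <cstdint>
#include <cstdio>

// Stand-in for executorch::aten::ScalarType (assumption for the demo).
enum class ScalarType { Byte, Char, Short };

#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
  _(uint8_t, Byte)                           \
  _(int8_t, Char)

template <typename T>
void process() {
  std::printf("dispatched to a %zu-byte integer type\n", sizeof(T));
}

void dispatch(ScalarType dtype) {
  // Each expansion of HANDLE_TYPE becomes one case label in the switch.
#define HANDLE_TYPE(ctype, name) \
  case ScalarType::name:         \
    process<ctype>();            \
    break;
  switch (dtype) {
    ET_FORALL_CADENCE_QUANTIZED_TYPES(HANDLE_TYPE)
    default:
      std::printf("unsupported dtype\n");
  }
#undef HANDLE_TYPE
}

int main() {
  dispatch(ScalarType::Char);  // prints: dispatched to a 1-byte integer type
}
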
backends/cadence/generic/operators/op_quantized_add.cpp (new file)

Lines changed: 216 additions & 0 deletions

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_quantized_add.h>

#include <executorch/backends/cadence/generic/kernels/kernels.h>
#include <executorch/backends/cadence/generic/operators/quantized_op_macros.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

namespace impl::generic::native {

using ::executorch::aten::Scalar;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;
using ::impl::generic::kernels::dequantize;
using ::impl::generic::kernels::quantize;

DECLARE_POINTWISE_TENSOR_QUANTIZED_BINARY_OP(quantized_add_, +);

#define DECLARE_POINTWISE_SCALAR_QUANTIZED_BINARY_OP(BINARY_FUNC_NAME, OP) \
  template <typename T>                                                    \
  void BINARY_FUNC_NAME(                                                   \
      const Tensor& X,                                                     \
      float X_scale,                                                       \
      int32_t X_zero_point,                                                \
      const float Y,                                                       \
      float out_scale,                                                     \
      int32_t out_zero_point,                                              \
      Tensor& out) {                                                       \
    const T* __restrict__ X_data = X.const_data_ptr<T>();                  \
    T* __restrict__ out_data = out.mutable_data_ptr<T>();                  \
    float inv_out_scale = 1.0f / out_scale;                                \
    for (size_t i = 0, e = X.numel(); i < e; ++i) {                        \
      float x = dequantize<T>(X_data[i], X_scale, X_zero_point);           \
      float z = x OP Y;                                                    \
      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point);         \
    }                                                                      \
  }

DECLARE_POINTWISE_SCALAR_QUANTIZED_BINARY_OP(quantized_add_Scalar_, +);

Tensor& quantized_add_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& X,
    const Tensor& X_scale_t,
    const Tensor& X_zero_point_t,
    const Tensor& Y,
    const Tensor& Y_scale_t,
    const Tensor& Y_zero_point_t,
    double out_scale,
    int64_t out_zero_point,
    Tensor& out) {
  float X_scale = X_scale_t.const_data_ptr<float>()[0];
  int32_t X_zero_point = X_zero_point_t.const_data_ptr<int32_t>()[0];
  float Y_scale = Y_scale_t.const_data_ptr<float>()[0];
  int32_t Y_zero_point = Y_zero_point_t.const_data_ptr<int32_t>()[0];

#define typed_quantized_add(ctype, dtype)       \
  case ScalarType::dtype: {                     \
    quantized_add_<ctype>(                      \
        X,                                      \
        X_scale,                                \
        X_zero_point,                           \
        Y,                                      \
        Y_scale,                                \
        Y_zero_point,                           \
        static_cast<float>(out_scale),          \
        static_cast<int32_t>(out_zero_point),   \
        out);                                   \
    break;                                      \
  }

  ScalarType dtype = out.scalar_type();
  switch (dtype) {
    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_add);
    default:
      ET_DCHECK_MSG(
          false, "Unhandled dtype %s", torch::executor::toString(dtype));
  }
#undef typed_quantized_add

  return out;
}

Tensor& quantized_add_per_tensor_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& X,
    double X_scale,
    int64_t X_zero_point,
    const Tensor& Y,
    double Y_scale,
    int64_t Y_zero_point,
    double out_scale,
    int64_t out_zero_point,
    Tensor& out) {
#define typed_quantized_add(ctype, dtype)       \
  case ScalarType::dtype: {                     \
    quantized_add_<ctype>(                      \
        X,                                      \
        static_cast<float>(X_scale),            \
        static_cast<int32_t>(X_zero_point),     \
        Y,                                      \
        static_cast<float>(Y_scale),            \
        static_cast<int32_t>(Y_zero_point),     \
        static_cast<float>(out_scale),          \
        static_cast<int32_t>(out_zero_point),   \
        out);                                   \
    break;                                      \
  }

  ScalarType dtype = out.scalar_type();
  switch (dtype) {
    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_add);
    default:
      ET_DCHECK_MSG(
          false, "Unhandled dtype %s", torch::executor::toString(dtype));
  }
#undef typed_quantized_add
  return out;
}

Tensor& quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& X,
    double X_scale,
    int64_t X_zero_point,
    const Tensor& Y,
    double Y_scale,
    int64_t Y_zero_point,
    double out_scale,
    int64_t out_zero_point,
    Tensor& out) {
  quantized_add_<int8_t>(
      X,
      static_cast<float>(X_scale),
      static_cast<int32_t>(X_zero_point),
      Y,
      static_cast<float>(Y_scale),
      static_cast<int32_t>(Y_zero_point),
      static_cast<float>(out_scale),
      static_cast<int32_t>(out_zero_point),
      out);
  return out;
}

Tensor& quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& X,
    double X_scale,
    int64_t X_zero_point,
    const Tensor& Y,
    double Y_scale,
    int64_t Y_zero_point,
    double out_scale,
    int64_t out_zero_point,
    Tensor& out) {
  quantized_add_<uint8_t>(
      X,
      static_cast<float>(X_scale),
      static_cast<int32_t>(X_zero_point),
      Y,
      static_cast<float>(Y_scale),
      static_cast<int32_t>(Y_zero_point),
      static_cast<float>(out_scale),
      static_cast<int32_t>(out_zero_point),
      out);
  return out;
}

Tensor& quantized_add_Scalar_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& X,
    const Tensor& X_scale_t,
    const Tensor& X_zero_point_t,
    const Scalar& Y_scalar,
    double out_scale,
    int64_t out_zero_point,
    Tensor& out) {
  float X_scale = X_scale_t.const_data_ptr<float>()[0];
  int32_t X_zero_point = X_zero_point_t.const_data_ptr<int32_t>()[0];
  float Y = static_cast<float>(
      ::torch::executor::native::utils::scalar_to<double>(Y_scalar));
#define typed_quantized_add_Scalar(ctype, dtype) \
  case ScalarType::dtype: {                      \
    quantized_add_Scalar_<ctype>(                \
        X,                                       \
        X_scale,                                 \
        X_zero_point,                            \
        Y,                                       \
        static_cast<float>(out_scale),           \
        static_cast<int32_t>(out_zero_point),    \
        out);                                    \
    break;                                       \
  }

  ScalarType dtype = out.scalar_type();
  switch (dtype) {
    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_add_Scalar)
    default:
      ET_DCHECK_MSG(
          false, "Unhandled dtype %s", torch::executor::toString(dtype));
  }
#undef typed_quantized_add_Scalar
  return out;
}

#undef DECLARE_POINTWISE_SCALAR_QUANTIZED_BINARY_OP

} // namespace impl::generic::native
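
For intuition, the tensor variant computes out[i] = quantize(dequantize(X[i]) + dequantize(Y[i])). A small standalone sketch of that flow with concrete numbers (the helpers are stand-ins for impl::generic::kernels, under the same assumptions as the sketch after kernels.h above):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Stand-in affine (de)quantization helpers; assumptions, not the real kernels.
static float dequantize(int8_t q, float scale, int32_t zp) {
  return scale * static_cast<float>(q - zp);
}
static int8_t quantize(float x, float inv_scale, int32_t zp) {
  float v = std::round(x * inv_scale) + static_cast<float>(zp);
  v = std::fmin(std::fmax(v, -128.0f), 127.0f);  // clamp to the int8 range
  return static_cast<int8_t>(v);
}

int main() {
  // X holds 0.5 at scale 0.05 (q = 10); Y holds 0.25 at scale 0.025 (q = 10).
  const int8_t xq = 10, yq = 10;
  // Dequantize both operands, add in float, then requantize at out_scale.
  const float z = dequantize(xq, 0.05f, 0) + dequantize(yq, 0.025f, 0);  // 0.75
  const int8_t zq = quantize(z, 1.0f / 0.01f, 0);  // out_scale = 0.01 -> q = 75
  std::printf("zq = %d (~%.2f)\n", zq, zq * 0.01f);  // zq = 75 (~0.75)
}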
