[ET-VK] Enable and test texture IO for quantized convolution ops (#16082)

SS-JIA · ssjia · web-flow · commit 77d9e9720a94 · 2025-12-04T20:15:54.000-05:00
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #16082 * #16079 Title says it all! Using texture3d as the input/output storage type may allow for additional optimizations re: bounds checking. Differential Revision: [D88395020](https://our.internmc.facebook.com/intern/diff/D88395020/) --------- Co-authored-by: ssjia <ssjia@devvm1479.ncg0.facebook.com>
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh
@@ -16,7 +16,7 @@
 
 void store_packed_int8_output_tile(
     const Int8OutTile int8_tile,
-    const Conv2dBlockIndex block_idx,
+    Conv2dBlockIndex block_idx,
     const Conv2dBlockExtents block_extents) {
 #ifdef PACKED_INT8_OUTPUT_BUFFER
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) {
@@ -34,8 +34,11 @@ void store_packed_int8_output_tile(
     [[unroll]] for (int n4 = 0; n4 < TILE_N4; n4++) {
       if (block_idx.data.x + m4 < block_extents.data.x &&
           block_idx.data.z + n4 < block_extents.data.z) {
+        const ivec3 idx_offset = ivec3(m4, 0, n4);
         imageStore(
-            t_packed_int8_output, block_idx.data, int8_tile.data[m4][n4]);
+            t_packed_int8_output,
+            block_idx.data + idx_offset,
+            int8_tile.data[m4][n4]);
       }
     }
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml
@@ -14,6 +14,7 @@ conv2d_q8ta_q8csw_q8to:
       parameter_names: [IO_STORAGE, WEIGHT_STORAGE]
       combos:
         - parameter_values: [buffer, texture2d]
+        - parameter_values: [texture3d, texture2d]
     DTYPE:
       - VALUE: float
   shader_variants:
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml
@@ -14,6 +14,7 @@ conv2d_q8ta_q8csw_q8to_linear_tiled:
       parameter_names: [IO_STORAGE, WEIGHT_STORAGE]
       combos:
         - parameter_values: [buffer, texture2d]
+        - parameter_values: [texture3d, texture2d]
     DTYPE:
       - VALUE: float
   shader_variants:
diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml
@@ -10,5 +10,6 @@ im2col_packed_int8:
   generate_variant_forall:
     STORAGE:
       - VALUE: buffer
+      - VALUE: texture3d
   shader_variants:
     - NAME: im2col_packed_int8
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh
@@ -31,13 +31,13 @@ void printInt8InputTile(const Int8InputTile tile) {
 
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) {
     [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
-      debugPrintfEXT("  tile[%d][%d] (ivec4): ", m4, k4);
+      debugPrintfEXT("  tile[%d][%d]:\\n", m4, k4);
 
       // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit
       // values
       [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) {
         int packed_int = tile.data[m4][k4][vec_idx];
-        debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int);
+        debugPrintfEXT("    [", vec_idx, packed_int);
 
         // Extract 4 8-bit values from this packed integer
         [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) {
@@ -48,6 +48,7 @@ void printInt8InputTile(const Int8InputTile tile) {
             debugPrintfEXT("%d] ", val);
           }
         }
+        debugPrintfEXT("(packed=%d)\\n", packed_int);
       }
       debugPrintfEXT("\\n");
     }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh
@@ -35,17 +35,17 @@ void initialize(out Int8OutTile tile) {
 
 void printInt8OutTile(const Int8OutTile tile) {
   debugPrintfEXT(
-      "Int8InputTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4);
+      "Int8OutTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4);
 
   [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) {
     [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
-      debugPrintfEXT("  tile[%d][%d] (ivec4): ", m4, n4);
+      debugPrintfEXT("  tile[%d][%d]:\\n", m4, n4);
 
       // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit
       // values
       [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) {
         int packed_int = tile.data[m4][n4][vec_idx];
-        debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int);
+        debugPrintfEXT("    [", vec_idx, packed_int);
 
         // Extract 4 8-bit values from this packed integer
         [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) {
@@ -56,6 +56,7 @@ void printInt8OutTile(const Int8OutTile tile) {
             debugPrintfEXT("%d] ", val);
           }
         }
+        debugPrintfEXT("(packed=%d)\\n", packed_int);
       }
       debugPrintfEXT("\\n");
     }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4w4c.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_4w4c.yaml
@@ -14,6 +14,7 @@ quantize_and_pack_4w4c:
       parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
       combos:
         - parameter_values: [texture3d, texture3d]
+        - parameter_values: [texture3d, buffer]
         - parameter_values: [buffer, texture3d]
         - parameter_values: [buffer, buffer]
     DTYPE:
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_4w4c_and_dequantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/unpack_4w4c_and_dequantize.yaml
@@ -14,6 +14,7 @@ unpack_4w4c_and_dequantize:
       parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
       combos:
         - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
         - parameter_values: [texture3d, buffer]
         - parameter_values: [buffer, buffer]
     DTYPE:
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp
@@ -1400,7 +1400,7 @@ void static_quantized_conv2d_impl(
         &graph,
         input_im2col_sizes,
         vkapi::kInt8x4,
-        utils::kBuffer,
+        graph.storage_type_of(packed_int8_input),
         utils::kPackedInt8_4W4C);
 
     packed_int8_input_im2col = packed_int8_input_im2col_tensor.vref;
@@ -1492,7 +1492,8 @@ void conv2d_q8ta_q8csw_q8to(
 
 void conv2d_q8ta_q8csw_q8to_test(
     ComputeGraph& graph,
-    const std::vector<ValueRef>& args) {
+    const std::vector<ValueRef>& args,
+    utils::StorageType io_storage_type) {
   int32_t idx = 0;
   const ValueRef fp_input = args.at(idx++);
   const ValueRef input_scale = args.at(idx++);
@@ -1514,14 +1515,14 @@ void conv2d_q8ta_q8csw_q8to_test(
       &graph,
       graph.sizes_of(fp_input),
       vkapi::kInt8x4,
-      utils::kBuffer,
+      io_storage_type,
       utils::kPackedInt8_4W4C);
 
   TmpTensor packed_int8_output(
       &graph,
       graph.sizes_of(fp_output),
       vkapi::kInt8x4,
-      utils::kBuffer,
+      io_storage_type,
       utils::kPackedInt8_4W4C);
 
   add_quantize_and_pack_4w4c_node(
@@ -1550,10 +1551,27 @@ void conv2d_q8ta_q8csw_q8to_test(
       graph, packed_int8_output, output_scale, output_zp, fp_output);
 }
 
+// Test-only entry point: runs conv2d_q8ta_q8csw_q8to_test with the packed
+// int8 input/output tensors allocated using buffer storage.
+void conv2d_q8ta_q8csw_q8to_test_buffer(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  const utils::StorageType io_storage_type = utils::kBuffer;
+  conv2d_q8ta_q8csw_q8to_test(graph, args, io_storage_type);
+}
+
+// Test-only entry point: runs conv2d_q8ta_q8csw_q8to_test with the packed
+// int8 input/output tensors allocated using texture3d storage.
+void conv2d_q8ta_q8csw_q8to_test_texture(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  // Must be kTexture3D: passing kBuffer here would make this variant a
+  // duplicate of the _buffer wrapper and leave texture IO untested.
+  conv2d_q8ta_q8csw_q8to_test(graph, args, utils::kTexture3D);
+}
+
 REGISTER_OPERATORS {
   VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw);
   VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw);
-  VK_REGISTER_OP(etvk.conv2d_q8ta_q8csw_q8to.test, conv2d_q8ta_q8csw_q8to_test);
+  VK_REGISTER_OP(
+      etvk.conv2d_q8ta_q8csw_q8to.test_texture,
+      conv2d_q8ta_q8csw_q8to_test_texture);
+  VK_REGISTER_OP(
+      etvk.conv2d_q8ta_q8csw_q8to.test_buffer,
+      conv2d_q8ta_q8csw_q8to_test_buffer);
   VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw_q8to.default, conv2d_q8ta_q8csw_q8to);
   VK_REGISTER_OP(
       et_vk.conv2d_q8ta_q8csw_q8to_dw.default, conv2d_q8ta_q8csw_q8to);
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
@@ -13,6 +13,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
+// #define DEBUG_MODE
+
 using namespace executorch::vulkan::prototyping;
 
 using namespace vkcompute;
@@ -23,7 +25,8 @@ static constexpr int64_t kRefDimSizeLimit = 100;
 TestCase create_test_case_from_config(
     const Conv2dConfig& config,
     utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
+    vkapi::ScalarType input_dtype,
+    utils::StorageType interm_storage_type) {
   TestCase test_case;
 
   // Create a descriptive name for the test case
@@ -35,8 +38,15 @@ TestCase create_test_case_from_config(
       config.test_case_name + "_" + storage_str + "_" + dtype_str;
   test_case.set_name(test_name);
 
+  std::string operator_suffix = ".test";
+  if (interm_storage_type == utils::kTexture3D) {
+    operator_suffix += "_texture";
+  } else {
+    operator_suffix += "_buffer";
+  }
+
   // Set the operator name for the test case
-  std::string operator_name = "etvk." + config.op_name + ".test";
+  std::string operator_name = "etvk." + config.op_name + operator_suffix;
   test_case.set_operator_name(operator_name);
 
   // Calculate output dimensions
@@ -56,7 +66,12 @@ TestCase create_test_case_from_config(
       input_dtype,
       storage_type,
       io_memory_layout,
-      DataGenType::RANDOM);
+#ifdef DEBUG_MODE
+      DataGenType::RANDOM
+#else
+      DataGenType::RANDOM
+#endif
+  );
 
   if (debugging()) {
     print_valuespec_data(input_tensor, "input_tensor");
@@ -193,8 +208,10 @@ std::vector<TestCase> generate_quantized_conv2d_easy_cases() {
   // Generate test cases for each combination
   for (const auto& storage_type : storage_types) {
     for (const auto& input_dtype : float_types) {
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, input_dtype));
+      test_cases.push_back(create_test_case_from_config(
+          config, storage_type, input_dtype, utils::kBuffer));
+      test_cases.push_back(create_test_case_from_config(
+          config, storage_type, input_dtype, utils::kTexture3D));
     }
   }
 
@@ -373,8 +390,10 @@ std::vector<TestCase> generate_quantized_conv2d_test_cases() {
       if (vkcompute::api::context()
               ->adapter_ptr()
               ->supports_int8_dot_product()) {
-        test_cases.push_back(
-            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+        test_cases.push_back(create_test_case_from_config(
+            config, storage_type, vkapi::kFloat, utils::kBuffer));
+        test_cases.push_back(create_test_case_from_config(
+            config, storage_type, vkapi::kFloat, utils::kTexture3D));
       }
     }
   }
@@ -610,7 +629,11 @@ int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) {
 int main(int argc, char* argv[]) {
   set_debugging(false);
   set_print_output(false);
+#ifdef DEBUG_MODE
+  set_print_latencies(true);
+#else
   set_print_latencies(false);
+#endif
   set_use_gpu_timestamps(true);
 
   print_performance_header();
@@ -623,11 +646,20 @@ int main(int argc, char* argv[]) {
 
   // Execute test cases using the new framework with custom FLOP calculator
   auto results = execute_test_cases(
+#ifdef DEBUG_MODE
+      generate_quantized_conv2d_easy_cases,
+#else
       generate_quantized_conv2d_test_cases,
+#endif
       quantized_conv2d_flop_calculator,
       "QuantizedConv2dQ8ToQ8To",
+#ifdef DEBUG_MODE
+      0,
+      1,
+#else
       3,
       10,
+#endif
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp
@@ -24,7 +24,8 @@ static constexpr int64_t kRefDimSizeLimit = 100;
 TestCase create_test_case_from_config(
     const Conv2dConfig& config,
     utils::StorageType storage_type,
-    vkapi::ScalarType input_dtype) {
+    vkapi::ScalarType input_dtype,
+    utils::StorageType interm_storage_type) {
   TestCase test_case;
 
   // Create a descriptive name for the test case
@@ -36,8 +37,15 @@ TestCase create_test_case_from_config(
       config.test_case_name + "_" + storage_str + "_" + dtype_str;
   test_case.set_name(test_name);
 
+  std::string operator_suffix = ".test";
+  if (interm_storage_type == utils::kTexture3D) {
+    operator_suffix += "_texture";
+  } else {
+    operator_suffix += "_buffer";
+  }
+
   // Set the operator name for the test case
-  std::string operator_name = "etvk." + config.op_name + ".test";
+  std::string operator_name = "etvk." + config.op_name + operator_suffix;
   test_case.set_operator_name(operator_name);
 
   // Calculate output dimensions
@@ -202,8 +210,10 @@ std::vector<TestCase> generate_quantized_conv2d_dw_easy_cases() {
   // Generate test cases for each combination
   for (const auto& storage_type : storage_types) {
     for (const auto& input_dtype : float_types) {
-      test_cases.push_back(
-          create_test_case_from_config(config, storage_type, input_dtype));
+      test_cases.push_back(create_test_case_from_config(
+          config, storage_type, input_dtype, utils::kBuffer));
+      test_cases.push_back(create_test_case_from_config(
+          config, storage_type, input_dtype, utils::kTexture3D));
     }
   }
 
@@ -325,8 +335,10 @@ std::vector<TestCase> generate_quantized_conv2d_dw_test_cases() {
       if (vkcompute::api::context()
               ->adapter_ptr()
               ->supports_int8_dot_product()) {
-        test_cases.push_back(
-            create_test_case_from_config(config, storage_type, vkapi::kFloat));
+        test_cases.push_back(create_test_case_from_config(
+            config, storage_type, vkapi::kFloat, utils::kBuffer));
+        test_cases.push_back(create_test_case_from_config(
+            config, storage_type, vkapi::kFloat, utils::kTexture3D));
       }
     }
   }

Original file line number	Diff line number	Diff line change
`@@ -31,13 +31,13 @@ void printInt8InputTile(const Int8InputTile tile) {`
`31`	`31`
`32`	`32`	`[[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) {`
`33`	`33`	`[[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {`
`34`		`- debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, k4);`
	`34`	`+ debugPrintfEXT(" tile[%d][%d]:\\n", m4, k4);`
`35`	`35`
`36`	`36`	`// Each ivec4 contains 4 packed integers, each integer contains 4 8-bit`
`37`	`37`	`// values`
`38`	`38`	`[[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) {`
`39`	`39`	`int packed_int = tile.data[m4][k4][vec_idx];`
`40`		`- debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int);`
	`40`	`+ debugPrintfEXT(" [", vec_idx, packed_int);`
`41`	`41`
`42`	`42`	`// Extract 4 8-bit values from this packed integer`
`43`	`43`	`[[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) {`
`@@ -48,6 +48,7 @@ void printInt8InputTile(const Int8InputTile tile) {`
`48`	`48`	`debugPrintfEXT("%d] ", val);`
`49`	`49`	`}`
`50`	`50`	`}`
	`51`	`+ debugPrintfEXT("(packed=%d)\\n", packed_int);`
`51`	`52`	`}`
`52`	`53`	`debugPrintfEXT("\\n");`
`53`	`54`	`}`
Original file line number	Diff line number	Diff line change
`@@ -35,17 +35,17 @@ void initialize(out Int8OutTile tile) {`
`35`	`35`
`36`	`36`	`void printInt8OutTile(const Int8OutTile tile) {`
`37`	`37`	`debugPrintfEXT(`
`38`		`- "Int8InputTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4);`
	`38`	`+ "Int8OutTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4);`
`39`	`39`
`40`	`40`	`[[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) {`
`41`	`41`	`[[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {`
`42`		`- debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, n4);`
	`42`	`+ debugPrintfEXT(" tile[%d][%d]:\\n", m4, n4);`
`43`	`43`
`44`	`44`	`// Each ivec4 contains 4 packed integers, each integer contains 4 8-bit`
`45`	`45`	`// values`
`46`	`46`	`[[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) {`
`47`	`47`	`int packed_int = tile.data[m4][n4][vec_idx];`
`48`		`- debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int);`
	`48`	`+ debugPrintfEXT(" [", vec_idx, packed_int);`
`49`	`49`
`50`	`50`	`// Extract 4 8-bit values from this packed integer`
`51`	`51`	`[[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) {`
`@@ -56,6 +56,7 @@ void printInt8OutTile(const Int8OutTile tile) {`
`56`	`56`	`debugPrintfEXT("%d] ", val);`
`57`	`57`	`}`
`58`	`58`	`}`
	`59`	`+ debugPrintfEXT("(packed=%d)\\n", packed_int);`
`59`	`60`	`}`
`60`	`61`	`debugPrintfEXT("\\n");`
`61`	`62`	`}`