[ET-VK] Allow buffer input/output for quantize/dequantize for conv2d ops (#15747)

pytorchbot · web-flow · commit f4af424bce37 · 2025-11-11T13:47:01.000-05:00
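This diff allows the quantize/dequantize ops used for quantized conv2d to consume/produce floating-point tensors in the `CONTIGUOUS_BUFFER` (width-packed buffer) layout, in addition to channels-packed textures: the quantize op can now read its input from such a buffer, and the dequantize op can now write its output to one. This can help reduce the number of memory layout transitions needed to execute a model. Differential Revision: [D86674166](https://our.internmc.facebook.com/intern/diff/D86674166/)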
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -636,7 +636,7 @@ def register_quantized_binary_op():
 def register_quantize_for_conv2d_op():
     return OpFeatures(
         inputs_storage=[
-            utils.CHANNELS_PACKED_TEXTURE,
+            utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
         outputs_storage=[
             utils.PACKED_INT8_4W4C_BUFFER,
@@ -656,7 +656,7 @@ def register_dequantize_for_conv2d_op():
             utils.PACKED_INT8_4W4C_BUFFER,
         ],
         outputs_storage=[
-            utils.CHANNELS_PACKED_TEXTURE,
+            utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
         supports_resize=False,
     )
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh
@@ -14,7 +14,21 @@
 #include "linear_fp_input_tile.glslh"
 
 VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) {
+#ifdef INPUT_BUFFER
+  VEC4_T texel = VEC4_T(0);
+  const int c_idx = mul_4(tidx.data.z);
+  const int c_stride = input_sizes.y * input_sizes.x;
+
+  const int base_buf_i = c_idx * c_stride + tidx.data.y * input_sizes.x + tidx.data.x;
+  const int limit = min(input_sizes.z - c_idx, 4);
+
+  for (int i = 0; i < limit; i++) {
+    texel[i] = t_fp_input[base_buf_i + i * c_stride];
+  }
+  return texel;
+#else
   return texelFetch(t_fp_input, tidx.data, 0);
+#endif
 }
 
 void load_fp_input_tile(
@@ -23,7 +37,9 @@ void load_fp_input_tile(
 #if TILE_M == 4 && TILE_K4 == 1
   Conv2dTensorIndex load_tidx = block_idx_to_tensor_idx(block_idx);
   [[unroll]] for (int w = 0; w < TILE_M; w++) {
-    tile.data[w][0] = load_fp_input_texel(load_tidx);
+    if (load_tidx.data.x < input_sizes.x) {
+      tile.data[w][0] = load_fp_input_texel(load_tidx);
+    }
     load_tidx.data.x++;
   }
 #else
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl
@@ -31,7 +31,7 @@ layout(std430) buffer;
 #include "conv2d_common.glslh"
 
 ${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE)}
 
 ${layout_declare_ubo(B, "ivec4", "input_sizes")}
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml
@@ -15,6 +15,7 @@ quantize_and_pack_q8ta_conv2d_input:
       combos:
         - parameter_values: [texture3d, texture3d]
         - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
   shader_variants:
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl
@@ -30,7 +30,7 @@ layout(std430) buffer;
 
 #include "conv2d_common.glslh"
 
-${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE)}
 ${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)}
 
 ${layout_declare_ubo(B, "ivec4", "output_sizes")}
@@ -84,15 +84,29 @@ void unpack_and_dequantize(
 void store_fp_output_texel(
     const Conv2dTensorIndex tidx,
     const VEC4_T out_texel) {
+#ifdef OUTPUT_BUFFER
+  const int c_idx = mul_4(tidx.data.z);
+  const int c_stride = output_sizes.y * output_sizes.x;
+
+  const int base_buf_i = c_idx * c_stride + tidx.data.y * output_sizes.x + tidx.data.x;
+  const int limit = min(output_sizes.z - c_idx, 4);
+
+  for (int i = 0; i < limit; ++i) {
+    t_fp_output[base_buf_i + i * c_stride] = out_texel[i];
+  }
+#else
   imageStore(t_fp_output, tidx.data, out_texel);
+#endif
 }
 
 void store_fp_tile(
     const FPInputTile block,
     const Conv2dBlockIndex block_idx) {
   Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx);
   [[unroll]] for (int w = 0; w < 4; w++) {
-    store_fp_output_texel(store_tidx, block.data[w][0]);
+    if (store_tidx.data.x < output_sizes.x) {
+      store_fp_output_texel(store_tidx, block.data[w][0]);
+    }
     store_tidx.data.x++;
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml
@@ -15,6 +15,7 @@ unpack_and_dequantize_q8ta_conv2d_output:
       combos:
         - parameter_values: [texture3d, texture3d]
         - parameter_values: [texture3d, buffer]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
   shader_variants:
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp
@@ -47,11 +47,15 @@ TestCase create_test_case_from_config(
   std::vector<int64_t> input_size = {
       1, config.channels.in, config.input_size.h, config.input_size.w};
 
+  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
+      ? utils::kWidthPacked
+      : utils::kChannelsPacked;
+
   ValueSpec input_tensor(
       input_size,
       input_dtype,
       storage_type,
-      utils::kChannelsPacked,
+      io_memory_layout,
       DataGenType::RANDOM);
 
   if (debugging()) {
@@ -139,7 +143,7 @@ TestCase create_test_case_from_config(
       {1, config.channels.out, H_out, W_out},
       input_dtype,
       storage_type,
-      utils::kChannelsPacked,
+      io_memory_layout,
       DataGenType::ZEROS);
 
   // Add all specs to test case for q8ta_q8csw_q8to operation
@@ -182,7 +186,8 @@ std::vector<TestCase> generate_quantized_conv2d_easy_cases() {
   config.op_name = "conv2d_q8ta_q8csw_q8to";
 
   // Test with both storage types and data types for completeness
-  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  std::vector<utils::StorageType> storage_types = {
+      utils::kTexture3D, utils::kBuffer};
   std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
 
   // Generate test cases for each combination
@@ -341,7 +346,8 @@ std::vector<TestCase> generate_quantized_conv2d_test_cases() {
        4}};
 
   // Test with different storage types and data types
-  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  std::vector<utils::StorageType> storage_types = {
+      utils::kTexture3D, utils::kBuffer};
 
   // Generate test cases for each combination
   for (auto& config : configs) {
@@ -621,7 +627,7 @@ int main(int argc, char* argv[]) {
       quantized_conv2d_flop_calculator,
       "QuantizedConv2dQ8ToQ8To",
       0,
-      10,
+      1,
       ref_fn);
 
   return 0;
diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp
@@ -38,21 +38,17 @@ TestCase create_quantized_add_test_case(
   // Set the operator name for the test case
   test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
 
+  utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
+      ? utils::kWidthPacked
+      : utils::kChannelsPacked;
+
   // Input tensor A (float/half)
   ValueSpec input_a(
-      sizes,
-      input_dtype,
-      storage_type,
-      utils::kChannelsPacked,
-      DataGenType::RANDOM);
+      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
 
   // Input tensor B (float/half)
   ValueSpec input_b(
-      sizes,
-      input_dtype,
-      storage_type,
-      utils::kChannelsPacked,
-      DataGenType::RANDOM);
+      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
 
   // Quantization parameters for input A
   float input_a_scale_val = 0.007843; // 2/255 approximately
@@ -81,11 +77,7 @@ TestCase create_quantized_add_test_case(
 
   // Output tensor (float/half)
   ValueSpec output(
-      sizes,
-      input_dtype,
-      storage_type,
-      utils::kChannelsPacked,
-      DataGenType::ZEROS);
+      sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
 
   // Add all specs to test case for q8ta_q8ta_q8to add operation
   test_case.add_input_spec(input_a);
@@ -119,7 +111,8 @@ std::vector<TestCase> generate_quantized_add_test_cases() {
   };
 
   // Storage types to test
-  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  std::vector<utils::StorageType> storage_types = {
+      utils::kTexture3D, utils::kBuffer};
 
   // Data types to test
   std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
@@ -772,6 +772,14 @@ def make_filtered_tensor_repset(
 HEIGHT_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_HEIGHT_PACKED})
 CHANNELS_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_CHANNELS_PACKED})
 
+CHANNELS_PACKED_ANY = TensorRepSet(
+    {VkMemoryLayout.TENSOR_CHANNELS_PACKED}, {VkMemoryLayout.TENSOR_CHANNELS_PACKED}
+)
+
+CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER = TensorRepSet(
+    {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_CHANNELS_PACKED}
+)
+
 ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts)
 ANY_BUFFER = TensorRepSet(all_memory_layouts, set())