Skip to content

Commit f4af424

Browse files
authored
[ET-VK] Allow buffer input/output for quantize/dequantize for conv2d ops (#15747)
This diff allows the quantize/dequantize ops used for quantized conv2d to consume/produce floating-point tensors in the `CONTIGUOUS_BUFFER` layout, in addition to the channels-packed texture layout they already supported. This can help reduce the number of memory layout transitions needed to execute a model. Differential Revision: [D86674166](https://our.internmc.facebook.com/intern/diff/D86674166/)
1 parent c998f8a commit f4af424

9 files changed

+66
-27
lines changed

backends/vulkan/op_registry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -636,7 +636,7 @@ def register_quantized_binary_op():
636636
def register_quantize_for_conv2d_op():
637637
return OpFeatures(
638638
inputs_storage=[
639-
utils.CHANNELS_PACKED_TEXTURE,
639+
utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
640640
],
641641
outputs_storage=[
642642
utils.PACKED_INT8_4W4C_BUFFER,
@@ -656,7 +656,7 @@ def register_dequantize_for_conv2d_op():
656656
utils.PACKED_INT8_4W4C_BUFFER,
657657
],
658658
outputs_storage=[
659-
utils.CHANNELS_PACKED_TEXTURE,
659+
utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
660660
],
661661
supports_resize=False,
662662
)

backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,21 @@
1414
#include "linear_fp_input_tile.glslh"
1515

1616
VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) {
17+
#ifdef INPUT_BUFFER
18+
VEC4_T texel = VEC4_T(0);
19+
const int c_idx = mul_4(tidx.data.z);
20+
const int c_stride = input_sizes.y * input_sizes.x;
21+
22+
const int base_buf_i = c_idx * c_stride + tidx.data.y * input_sizes.x + tidx.data.x;
23+
const int limit = min(input_sizes.z - c_idx, 4);
24+
25+
for (int i = 0; i < limit; i++) {
26+
texel[i] = t_fp_input[base_buf_i + i * c_stride];
27+
}
28+
return texel;
29+
#else
1730
return texelFetch(t_fp_input, tidx.data, 0);
31+
#endif
1832
}
1933

2034
void load_fp_input_tile(
@@ -23,7 +37,9 @@ void load_fp_input_tile(
2337
#if TILE_M == 4 && TILE_K4 == 1
2438
Conv2dTensorIndex load_tidx = block_idx_to_tensor_idx(block_idx);
2539
[[unroll]] for (int w = 0; w < TILE_M; w++) {
26-
tile.data[w][0] = load_fp_input_texel(load_tidx);
40+
if (load_tidx.data.x < input_sizes.x) {
41+
tile.data[w][0] = load_fp_input_texel(load_tidx);
42+
}
2743
load_tidx.data.x++;
2844
}
2945
#else

backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ layout(std430) buffer;
3131
#include "conv2d_common.glslh"
3232

3333
${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)}
34-
${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}
34+
${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE)}
3535

3636
${layout_declare_ubo(B, "ivec4", "input_sizes")}
3737

backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ quantize_and_pack_q8ta_conv2d_input:
1515
combos:
1616
- parameter_values: [texture3d, texture3d]
1717
- parameter_values: [buffer, texture3d]
18+
- parameter_values: [buffer, buffer]
1819
DTYPE:
1920
- VALUE: float
2021
shader_variants:

backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ layout(std430) buffer;
3030

3131
#include "conv2d_common.glslh"
3232

33-
${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)}
33+
${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE)}
3434
${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)}
3535

3636
${layout_declare_ubo(B, "ivec4", "output_sizes")}
@@ -84,15 +84,29 @@ void unpack_and_dequantize(
8484
void store_fp_output_texel(
8585
const Conv2dTensorIndex tidx,
8686
const VEC4_T out_texel) {
87+
#ifdef OUTPUT_BUFFER
88+
const int c_idx = mul_4(tidx.data.z);
89+
const int c_stride = output_sizes.y * output_sizes.x;
90+
91+
const int base_buf_i = c_idx * c_stride + tidx.data.y * output_sizes.x + tidx.data.x;
92+
const int limit = min(output_sizes.z - c_idx, 4);
93+
94+
for (int i = 0; i < limit; ++i) {
95+
t_fp_output[base_buf_i + i * c_stride] = out_texel[i];
96+
}
97+
#else
8798
imageStore(t_fp_output, tidx.data, out_texel);
99+
#endif
88100
}
89101

90102
void store_fp_tile(
91103
const FPInputTile block,
92104
const Conv2dBlockIndex block_idx) {
93105
Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx);
94106
[[unroll]] for (int w = 0; w < 4; w++) {
95-
store_fp_output_texel(store_tidx, block.data[w][0]);
107+
if (store_tidx.data.x < output_sizes.x) {
108+
store_fp_output_texel(store_tidx, block.data[w][0]);
109+
}
96110
store_tidx.data.x++;
97111
}
98112
}

backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ unpack_and_dequantize_q8ta_conv2d_output:
1515
combos:
1616
- parameter_values: [texture3d, texture3d]
1717
- parameter_values: [texture3d, buffer]
18+
- parameter_values: [buffer, buffer]
1819
DTYPE:
1920
- VALUE: float
2021
shader_variants:

backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,15 @@ TestCase create_test_case_from_config(
4747
std::vector<int64_t> input_size = {
4848
1, config.channels.in, config.input_size.h, config.input_size.w};
4949

50+
utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
51+
? utils::kWidthPacked
52+
: utils::kChannelsPacked;
53+
5054
ValueSpec input_tensor(
5155
input_size,
5256
input_dtype,
5357
storage_type,
54-
utils::kChannelsPacked,
58+
io_memory_layout,
5559
DataGenType::RANDOM);
5660

5761
if (debugging()) {
@@ -139,7 +143,7 @@ TestCase create_test_case_from_config(
139143
{1, config.channels.out, H_out, W_out},
140144
input_dtype,
141145
storage_type,
142-
utils::kChannelsPacked,
146+
io_memory_layout,
143147
DataGenType::ZEROS);
144148

145149
// Add all specs to test case for q8ta_q8csw_q8to operation
@@ -182,7 +186,8 @@ std::vector<TestCase> generate_quantized_conv2d_easy_cases() {
182186
config.op_name = "conv2d_q8ta_q8csw_q8to";
183187

184188
// Test with both storage types and data types for completeness
185-
std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
189+
std::vector<utils::StorageType> storage_types = {
190+
utils::kTexture3D, utils::kBuffer};
186191
std::vector<vkapi::ScalarType> float_types = {vkapi::kFloat};
187192

188193
// Generate test cases for each combination
@@ -341,7 +346,8 @@ std::vector<TestCase> generate_quantized_conv2d_test_cases() {
341346
4}};
342347

343348
// Test with different storage types and data types
344-
std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
349+
std::vector<utils::StorageType> storage_types = {
350+
utils::kTexture3D, utils::kBuffer};
345351

346352
// Generate test cases for each combination
347353
for (auto& config : configs) {
@@ -621,7 +627,7 @@ int main(int argc, char* argv[]) {
621627
quantized_conv2d_flop_calculator,
622628
"QuantizedConv2dQ8ToQ8To",
623629
0,
624-
10,
630+
1,
625631
ref_fn);
626632

627633
return 0;

backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,17 @@ TestCase create_quantized_add_test_case(
3838
// Set the operator name for the test case
3939
test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test");
4040

41+
utils::GPUMemoryLayout io_memory_layout = storage_type == utils::kBuffer
42+
? utils::kWidthPacked
43+
: utils::kChannelsPacked;
44+
4145
// Input tensor A (float/half)
4246
ValueSpec input_a(
43-
sizes,
44-
input_dtype,
45-
storage_type,
46-
utils::kChannelsPacked,
47-
DataGenType::RANDOM);
47+
sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
4848

4949
// Input tensor B (float/half)
5050
ValueSpec input_b(
51-
sizes,
52-
input_dtype,
53-
storage_type,
54-
utils::kChannelsPacked,
55-
DataGenType::RANDOM);
51+
sizes, input_dtype, storage_type, io_memory_layout, DataGenType::RANDOM);
5652

5753
// Quantization parameters for input A
5854
float input_a_scale_val = 0.007843; // 2/255 approximately
@@ -81,11 +77,7 @@ TestCase create_quantized_add_test_case(
8177

8278
// Output tensor (float/half)
8379
ValueSpec output(
84-
sizes,
85-
input_dtype,
86-
storage_type,
87-
utils::kChannelsPacked,
88-
DataGenType::ZEROS);
80+
sizes, input_dtype, storage_type, io_memory_layout, DataGenType::ZEROS);
8981

9082
// Add all specs to test case for q8ta_q8ta_q8to add operation
9183
test_case.add_input_spec(input_a);
@@ -119,7 +111,8 @@ std::vector<TestCase> generate_quantized_add_test_cases() {
119111
};
120112

121113
// Storage types to test
122-
std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
114+
std::vector<utils::StorageType> storage_types = {
115+
utils::kTexture3D, utils::kBuffer};
123116

124117
// Data types to test
125118
std::vector<vkapi::ScalarType> data_types = {vkapi::kFloat};

backends/vulkan/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,14 @@ def make_filtered_tensor_repset(
772772
HEIGHT_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_HEIGHT_PACKED})
773773
CHANNELS_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_CHANNELS_PACKED})
774774

775+
CHANNELS_PACKED_ANY = TensorRepSet(
776+
{VkMemoryLayout.TENSOR_CHANNELS_PACKED}, {VkMemoryLayout.TENSOR_CHANNELS_PACKED}
777+
)
778+
779+
CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER = TensorRepSet(
780+
{VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_CHANNELS_PACKED}
781+
)
782+
775783
ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts)
776784
ANY_BUFFER = TensorRepSet(all_memory_layouts, set())
777785

0 commit comments

Comments
 (0)