From 33e9b531b96c1ee9fa581205c9e11ca28150b52b Mon Sep 17 00:00:00 2001 From: morelos Date: Wed, 11 Jun 2025 09:59:29 -0700 Subject: [PATCH] [ET-VK][Ops] enabling double support for quantization and dequantization ops With the added double support in the layout template, this diff enables double as an input dtype for the quantization ops and as an output dtype for the dequantization ops. Because 64-bit support is limited, the expectation is that IO is downgraded to 32-bit. Differential Revision: [D76289197](https://our.internmc.facebook.com/intern/diff/D76289197/) [ghstack-poisoned] --- .../runtime/graph/ops/glsl/dequantize.glsl | 10 ++++- .../runtime/graph/ops/glsl/dequantize.yaml | 1 + .../runtime/graph/ops/glsl/quantize.yaml | 1 + .../runtime/graph/ops/impl/Quantize.cpp | 2 + .../vulkan/test/op_tests/dequantize_test.cpp | 41 +++++++++++++++++++ .../vulkan/test/op_tests/quantize_test.cpp | 41 +++++++++++++++++++ 6 files changed, 94 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl index 478c175e523..4da5773738f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/dequantize.glsl @@ -155,7 +155,10 @@ $if MODE == "per_tensor": [[unroll]] for (int i = 0; i < 4; ++i) { IN_T qvalue = IN_T(intex[i]); OUT_T value = dequantize_val(qvalue, scale, zero_point); - outtex[i] = value; + $if OUT_DTYPE == "double": + outtex[i] = float(value); + $else: + outtex[i] = value; } write_texel(t_out, pos, outtex); @@ -198,7 +201,10 @@ $if MODE == "per_token": [[unroll]] for (int i = 0; i < 4; ++i) { IN_T qvalue = IN_T(intex[i]); OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - outtex[i] = value; + $if OUT_DTYPE == "double": + outtex[i] = float(value); + $else: + outtex[i] = value; } write_texel(t_out, pos, outtex); diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml 
index 0b6f3f10d1e..ca201165c12 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/dequantize.yaml @@ -15,6 +15,7 @@ dequantize: OUT_DTYPE: - VALUE: half - VALUE: float + - VALUE: double shader_variants: - NAME: dequantize_per_tensor MODE: per_tensor diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml index 985ede6efe8..97fc77f36fb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize.yaml @@ -11,6 +11,7 @@ quantize: IN_DTYPE: - VALUE: half - VALUE: float + - VALUE: double OUT_DTYPE: - VALUE: uint8 - VALUE: int8 diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp index 8cd92008b73..290e1a8f763 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp @@ -191,6 +191,7 @@ void quantize_per_tensor_impl( // Verify input is a floating point type VK_CHECK_COND( + graph.dtype_of(input) == vkapi::kDouble || graph.dtype_of(input) == vkapi::kFloat || graph.dtype_of(input) == vkapi::kHalf); @@ -214,6 +215,7 @@ void quantize_per_token_impl( // Verify input is a floating point type VK_CHECK_COND( + graph.dtype_of(input) == vkapi::kDouble || graph.dtype_of(input) == vkapi::kFloat || graph.dtype_of(input) == vkapi::kHalf); diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp index 597eb431e35..79d024eb685 100644 --- a/backends/vulkan/test/op_tests/dequantize_test.cpp +++ b/backends/vulkan/test/op_tests/dequantize_test.cpp @@ -462,6 +462,12 @@ void test_vulkan_dequantize_per_tensor( vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); + // Telling the system to expect a float instead of a double + // since the shader can only return 32bit anyways + if (out_dtype == at::kDouble) { + out_dtype = 
at::kFloat; + } + // Test with texture storage test_vulkan_dequantize_per_tensor_impl( input_sizes, @@ -496,6 +502,12 @@ void test_vulkan_dequantize_per_token( vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); + // Telling the system to expect a float instead of a double + // since the shader can only return 32bit anyways + if (out_dtype == at::kDouble) { + out_dtype = at::kFloat; + } + // Test with texture storage test_vulkan_dequantize_per_token_impl( input_sizes, @@ -790,6 +802,19 @@ TEST( at::kFloat); // output dtype } +TEST( + VulkanDequantizePerTensorTest, + test_vulkan_dequantize_per_tensor_int32_to_double) { + test_vulkan_dequantize_per_tensor( + {2, 4, 3}, // input sizes + 0.0001, // scale + 100, // zero_point + -2147483648, // quant_min + 2147483647, // quant_max + at::kInt, // input dtype + at::kDouble); // output dtype +} + void test_reference_dequantize_per_token( const std::vector& input_sizes, const std::vector& scales, @@ -1165,3 +1190,19 @@ TEST( at::kInt, // input dtype at::kFloat); // output dtype } + +TEST( + VulkanDequantizePerTokenTest, + test_vulkan_dequantize_per_token_int32_to_double) { + std::vector scales = {0.0001, 0.0002, 0.0003, 0.0}; + std::vector zero_points = {100, -100, 50, -50}; + + test_vulkan_dequantize_per_token( + {2, 2, 8}, // input sizes (2*2=4 tokens) + scales, + zero_points, + -2147483648, // quant_min + 2147483647, // quant_max + at::kInt, // input dtype + at::kDouble); // output dtype +} diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp index 87c0aaaed82..39581e523fa 100644 --- a/backends/vulkan/test/op_tests/quantize_test.cpp +++ b/backends/vulkan/test/op_tests/quantize_test.cpp @@ -432,6 +432,12 @@ void test_vulkan_quantize_per_tensor( vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); + // If the in_dtype is a double, convert to float for texture implementation + // since they don't support 64bit as inputs + if (in_dtype == at::kDouble) { + in_dtype 
= at::kFloat; + } + // Test with texture storage test_vulkan_quantize_per_tensor_impl( input_sizes, @@ -466,6 +472,12 @@ void test_vulkan_quantize_per_token( vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); + // If the in_dtype is a double, convert to float for texture implementation + // since they don't support 64bit as inputs + if (in_dtype == at::kDouble) { + in_dtype = at::kFloat; + } + // Test with texture storage test_vulkan_quantize_per_token_impl( input_sizes, @@ -718,6 +730,19 @@ TEST( at::kChar); // output dtype } +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_tensor_double_to_int8) { + test_vulkan_quantize_per_tensor( + {2, 3}, // input sizes + 0.01, // scale + 1, // zero_point + -128, // quant_min + 127, // quant_max + at::kDouble, // input dtype + at::kChar); // output dtype +} + void test_reference_quantize_per_token( const std::vector& input_sizes, const std::vector& pre_scales, @@ -1064,3 +1089,19 @@ TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) { at::kHalf, // input dtype at::kChar); // output dtype } + +TEST( + VulkanQuantizePerTensorTest, + test_vulkan_quantize_per_token_double_to_int8) { + std::vector scales = {0.1, 0.2}; + std::vector zero_points = {0, 5}; + + test_vulkan_quantize_per_token( + {2, 2}, // input sizes (2 tokens) + scales, + zero_points, + -128, // quant_min + 127, // quant_max + at::kDouble, // input dtype + at::kChar); // output dtype +}