Skip to content

[ET-VK][Ops] enabling double support for quantization and dequantization ops #11553

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
33e9b53
[ET-VK][Ops] enabling double support for quantization and dequantizat…
Jun 11, 2025
26b834e
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 11, 2025
7d22726
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 11, 2025
cd7c8d6
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 12, 2025
3015eec
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 12, 2025
8eb9125
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 12, 2025
57093c0
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 12, 2025
afe1f42
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 13, 2025
a553b06
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 13, 2025
78332f8
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 13, 2025
fc3f7b4
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 13, 2025
2de4d77
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 16, 2025
240f60b
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 16, 2025
0b386f4
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 17, 2025
ed60d2d
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 18, 2025
dceb1a1
Update on "[ET-VK][Ops] enabling double support for quantization and …
Jun 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dequantize_buffer:
OUT_DTYPE:
- VALUE: half
- VALUE: float
- VALUE: double
shader_variants:
- NAME: dequantize_per_tensor_buffer
MODE: per_tensor
Expand Down
10 changes: 8 additions & 2 deletions backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ void dequantize_per_tensor() {
[[unroll]] for (int i = 0; i < 4; ++i) {
IN_T qvalue = IN_T(intex[i]);
OUT_T value = dequantize_val(qvalue, scale, zero_point);
outtex[i] = value;
$if OUT_DTYPE == "double":
outtex[i] = float(value);
$else:
outtex[i] = value;
}
write_texel(t_out, pos, outtex);
}
Expand Down Expand Up @@ -177,7 +180,10 @@ void dequantize_per_token() {
[[unroll]] for (int i = 0; i < 4; ++i) {
IN_T qvalue = IN_T(intex[i]);
OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val);
outtex[i] = value;
$if OUT_DTYPE == "double":
outtex[i] = float(value);
$else:
outtex[i] = value;
}

write_texel(t_out, pos, outtex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dequantize_texture:
OUT_DTYPE:
- VALUE: half
- VALUE: float
- VALUE: double
shader_variants:
- NAME: dequantize_per_tensor_texture3d
MODE: per_tensor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ quantize_buffer:
IN_DTYPE:
- VALUE: half
- VALUE: float
- VALUE: double
OUT_DTYPE:
- VALUE: uint8
- VALUE: int8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ quantize_texture:
IN_DTYPE:
- VALUE: half
- VALUE: float
- VALUE: double
OUT_DTYPE:
- VALUE: uint8
- VALUE: int8
Expand Down
2 changes: 2 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ void quantize_per_tensor_impl(

// Verify input is a floating point type
VK_CHECK_COND(
graph.dtype_of(input) == vkapi::kDouble ||
graph.dtype_of(input) == vkapi::kFloat ||
graph.dtype_of(input) == vkapi::kHalf);

Expand All @@ -214,6 +215,7 @@ void quantize_per_token_impl(

// Verify input is a floating point type
VK_CHECK_COND(
graph.dtype_of(input) == vkapi::kDouble ||
graph.dtype_of(input) == vkapi::kFloat ||
graph.dtype_of(input) == vkapi::kHalf);

Expand Down
51 changes: 51 additions & 0 deletions backends/vulkan/test/op_tests/dequantize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,12 @@ void test_vulkan_dequantize_per_tensor(
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// Expect float rather than double for the texture path, since the texture
// shader can only return 32-bit values anyway.
if (out_dtype == at::kDouble) {
out_dtype = at::kFloat;
}

// Test with texture storage
test_vulkan_dequantize_per_tensor_impl(
input_sizes,
Expand Down Expand Up @@ -400,6 +406,12 @@ void test_vulkan_dequantize_per_token(
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// Expect float rather than double for the texture path, since the texture
// shader can only return 32-bit values anyway.
if (out_dtype == at::kDouble) {
out_dtype = at::kFloat;
}

// Test with texture storage
test_vulkan_dequantize_per_token_impl(
input_sizes,
Expand Down Expand Up @@ -793,6 +805,24 @@ TEST(
at::kHalf); // output dtype
}

// Per-tensor dequantization from int8 (at::kChar) with a double (at::kDouble)
// output dtype. The test is skipped when the adapter does not report full
// int8 buffer support.
TEST(
    VulkanDequantizePerTensorTest,
    test_vulkan_dequantize_per_tensor_int8_to_double) {
  const bool has_int8_buffers = vkcompute::api::context()
                                    ->adapter_ptr()
                                    ->has_full_int8_buffers_support();
  if (!has_int8_buffers) {
    GTEST_SKIP();
  }

  test_vulkan_dequantize_per_tensor(
      /* input sizes  */ {2, 3},
      /* scale        */ 0.05,
      /* zero_point   */ 10,
      /* quant_min    */ -128,
      /* quant_max    */ 127,
      /* input dtype  */ at::kChar,
      /* output dtype */ at::kDouble);
}

void test_reference_dequantize_per_token(
const std::vector<int>& input_sizes,
const std::vector<float>& scales,
Expand Down Expand Up @@ -1288,3 +1318,24 @@ TEST(
at::kInt, // input dtype
at::kHalf); // output dtype
}

// Per-token dequantization from int8 (at::kChar) with a double (at::kDouble)
// output dtype, using one scale / zero point per token. The test is skipped
// when the adapter does not report full int8 buffer support.
TEST(
    VulkanDequantizePerTokenTest,
    test_vulkan_dequantize_per_token_int8_to_double) {
  const bool has_int8_buffers = vkcompute::api::context()
                                    ->adapter_ptr()
                                    ->has_full_int8_buffers_support();
  if (!has_int8_buffers) {
    GTEST_SKIP();
  }

  const std::vector<float> per_token_scales{0.05, 0.001};
  const std::vector<int> per_token_zero_points{10, -5};

  test_vulkan_dequantize_per_token(
      /* input sizes (2 tokens) */ {2, 2},
      per_token_scales,
      per_token_zero_points,
      /* quant_min    */ -128,
      /* quant_max    */ 127,
      /* input dtype  */ at::kChar,
      /* output dtype */ at::kDouble);
}
51 changes: 51 additions & 0 deletions backends/vulkan/test/op_tests/quantize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,12 @@ void test_vulkan_quantize_per_tensor(
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// If in_dtype is double, fall back to float for the texture implementation,
// since textures do not support 64-bit inputs.
if (in_dtype == at::kDouble) {
in_dtype = at::kFloat;
}

// Test with texture storage
test_vulkan_quantize_per_tensor_impl(
input_sizes,
Expand Down Expand Up @@ -349,6 +355,12 @@ void test_vulkan_quantize_per_token(
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// If in_dtype is double, fall back to float for the texture implementation,
// since textures do not support 64-bit inputs.
if (in_dtype == at::kDouble) {
in_dtype = at::kFloat;
}

// Test with texture storage
test_vulkan_quantize_per_token_impl(
input_sizes,
Expand Down Expand Up @@ -655,6 +667,24 @@ TEST(
at::kChar); // output dtype
}

// Per-tensor quantization of a double (at::kDouble) input to int8 (at::kChar).
// The test is skipped when the adapter does not report full int8 buffer
// support.
TEST(
    VulkanQuantizePerTensorTest,
    test_vulkan_quantize_per_tensor_double_to_int8) {
  const bool has_int8_buffers = vkcompute::api::context()
                                    ->adapter_ptr()
                                    ->has_full_int8_buffers_support();
  if (!has_int8_buffers) {
    GTEST_SKIP();
  }

  test_vulkan_quantize_per_tensor(
      /* input sizes  */ {2, 3},
      /* scale        */ 0.01,
      /* zero_point   */ 1,
      /* quant_min    */ -128,
      /* quant_max    */ 127,
      /* input dtype  */ at::kDouble,
      /* output dtype */ at::kChar);
}

void test_reference_quantize_per_token(
const std::vector<int>& input_sizes,
const std::vector<float>& pre_scales,
Expand Down Expand Up @@ -1075,3 +1105,24 @@ TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) {
at::kHalf, // input dtype
at::kChar); // output dtype
}

// Per-token quantization of a double (at::kDouble) input to int8 (at::kChar).
// The test is skipped when the adapter does not report full int8 buffer
// support.
TEST(
    VulkanQuantizePerTensorTest,
    test_vulkan_quantize_per_token_double_to_int8) {
  const bool has_int8_buffers = vkcompute::api::context()
                                    ->adapter_ptr()
                                    ->has_full_int8_buffers_support();
  if (!has_int8_buffers) {
    GTEST_SKIP();
  }

  const std::vector<float> per_token_scales{0.1, 0.2};
  const std::vector<int> per_token_zero_points{0, 5};

  // NOTE(review): two scales / zero points are supplied for the {2, 2} input,
  // so this presumably exercises 2 tokens (not 4) -- confirm against the
  // helper's token-count convention.
  test_vulkan_quantize_per_token(
      /* input sizes  */ {2, 2},
      per_token_scales,
      per_token_zero_points,
      /* quant_min    */ -128,
      /* quant_max    */ 127,
      /* input dtype  */ at::kDouble,
      /* output dtype */ at::kChar);
}
Loading