Skip to content

[ET-VK][Ops] quantize_per_token.default test setup #11673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 384 additions & 0 deletions backends/vulkan/test/op_tests/quantize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,387 @@ void check_quantize_args(
" actual quant_max: ",
quant_max);
}
/*
 * Reference implementation of quantize_per_token
 *
 * Quantizes each token (each row along the flattened leading dimensions) of
 * `input` with its own scale/zero_point pair:
 *   q = clamp(zero_point + round(value / scale), quant_min, quant_max)
 * and returns a tensor of `dtype` with the same shape as `input`.
 *
 * `scale` and `zero_point` must contain exactly one element per token, where
 * the token count is the product of every dimension except the last.
 */
at::Tensor quantize_per_token_reference_impl(
    const at::Tensor& input,
    const at::Tensor& scale,
    const at::Tensor& zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype) {
  // Create output tensor with the target dtype
  at::Tensor out = at::empty_like(input, dtype);

  // Calculate number of tokens (product of all dims except the last)
  int64_t num_tokens = 1;
  for (int64_t i = 0; i < input.dim() - 1; i++) {
    num_tokens *= input.size(i);
  }

  // Verify that the number of tokens matches the size of scale and zero_point
  // tensors
  assert(num_tokens == scale.numel());
  assert(num_tokens == zero_point.numel());

  // Reshape input to [num_tokens, last_dim] so each row is one token
  at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)});
  at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)});

  // Quantize each token separately
  for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) {
    // Use float for scale since Vulkan doesn't support double
    float token_scale = scale[token_idx].item<float>();
    // Use int for zero_point since Vulkan doesn't support int64_t
    int token_zero_point = zero_point[token_idx].item<int>();

    float inv_scale = 1.0f / token_scale;

    // Quantize the token
    for (int64_t i = 0; i < input.size(-1); i++) {
      float value = reshaped_input[token_idx][i].item<float>();
      // Accumulate and clamp in int64_t: the previous `int qvalue` narrowed
      // the std::max<int64_t>/std::min<int64_t> results back to int, which
      // overflows when quant_min/quant_max span the full int32 range (as the
      // int32 test cases below do).
      int64_t qvalue = static_cast<int64_t>(token_zero_point) +
          static_cast<int64_t>(std::nearbyint(inv_scale * value));

      qvalue = std::max(qvalue, quant_min);
      qvalue = std::min(qvalue, quant_max);

      if (dtype == at::kByte) {
        reshaped_out[token_idx][i] = static_cast<uint8_t>(qvalue);
      } else if (dtype == at::kChar) {
        reshaped_out[token_idx][i] = static_cast<int8_t>(qvalue);
      } else if (dtype == at::kShort) {
        reshaped_out[token_idx][i] = static_cast<int16_t>(qvalue);
      } else if (dtype == at::kInt) {
        reshaped_out[token_idx][i] = static_cast<int32_t>(qvalue);
      } else if (dtype == at::kLong) {
        reshaped_out[token_idx][i] = static_cast<int64_t>(qvalue);
      }
    }
  }

  return out;
}

// Forward declaration. Runs quantize_per_token.default through the Vulkan
// compute graph using the given input/output storage types and compares the
// GPU result against the ATen reference implementation. Defined below;
// default arguments are supplied on the definition.
void test_vulkan_quantize_per_token_impl(
    const std::vector<int>& input_sizes,
    const std::vector<float>& scales,
    const std::vector<int>& zero_points,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType in_dtype,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage,
    const vkcompute::utils::StorageType out_storage);

// Wrapper function to test both buffer and texture storage types
void test_vulkan_quantize_per_token(
const std::vector<int>& input_sizes,
const std::vector<float>& scales,
const std::vector<int>& zero_points,
int64_t quant_min,
int64_t quant_max,
at::ScalarType in_dtype = at::kFloat,
at::ScalarType dtype = at::kInt) {
// Test with buffer storage
test_vulkan_quantize_per_token_impl(
input_sizes,
scales,
zero_points,
quant_min,
quant_max,
in_dtype,
dtype,
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// Test with texture storage
test_vulkan_quantize_per_token_impl(
input_sizes,
scales,
zero_points,
quant_min,
quant_max,
in_dtype,
dtype,
vkcompute::utils::kTexture3D,
vkcompute::utils::kTexture3D);
}

void test_reference_quantize_per_token(
const std::vector<int>& input_sizes,
const std::vector<float>& scales,
const std::vector<int>& zero_points,
int64_t quant_min,
int64_t quant_max,
at::ScalarType in_dtype = at::kFloat,
at::ScalarType dtype = at::kInt) {
check_quantize_args(quant_min, quant_max, dtype);
std::vector<int64_t> input_sizes_int64(
input_sizes.begin(), input_sizes.end());
at::Tensor input =
at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));

// Fill with a simple pattern: values from 0 to 1 in steps
float step = 1.0 / (input.numel() - 1);
auto flat_input = input.flatten();
for (int i = 0; i < flat_input.numel(); i++) {
flat_input[i] = i * step;
}

// Reshape back to original dimensions
input = flat_input.reshape(input_sizes_int64);

// Calculate number of tokens
int num_tokens = 1;
for (int i = 0; i < input.dim() - 1; i++) {
num_tokens *= input.size(i);
}

// Verify that the number of tokens matches the size of scales and zero_points
ASSERT_EQ(num_tokens, scales.size());
ASSERT_EQ(num_tokens, zero_points.size());

// Create scale and zero_point tensors
at::Tensor scale_tensor =
at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
at::Tensor zero_point_tensor =
at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));

// Get reference output
at::Tensor reference_out = quantize_per_token_reference_impl(
input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);

// Get implementation output
at::Tensor impl_out = torch::executor::native::quantize_per_token_aten(
input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);

// Convert to int for consistent display regardless of underlying type
at::Tensor reference_int = reference_out.to(at::kInt);
at::Tensor impl_int = impl_out.to(at::kInt);

const bool output_correct = at::equal(reference_int, impl_out);
if (!output_correct) {
std::cout << "\n"
<< "Failed with parameters: " << std::endl;
std::cout << " scale(s):";
for (size_t i = 0; i < scales.size(); i++) {
std::cout << " " << scales[i] << " ";
}
std::cout << "" << std::endl;
std::cout << " zero_point(s):";
for (size_t i = 0; i < zero_points.size(); i++) {
std::cout << " " << zero_points[i] << " ";
}
std::cout << "" << std::endl;
std::cout << " quant_min: " << quant_min << std::endl;
std::cout << " quant_max: " << quant_max << std::endl;

std::cout << "input:" << std::endl;
std::cout << input << std::endl;
std::cout << "reference:" << std::endl;
std::cout << reference_int << std::endl;
std::cout << "my_reference:" << std::endl;
std::cout << impl_out << std::endl;
}

ASSERT_TRUE(output_correct);
}

// Builds a Vulkan compute graph containing the quantize_per_token.default op,
// runs it on random input with the given storage types, and asserts the GPU
// output exactly matches the ATen reference (quantize_per_token_aten).
void test_vulkan_quantize_per_token_impl(
    const std::vector<int>& input_sizes,
    const std::vector<float>& scales,
    const std::vector<int>& zero_points,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType in_dtype = at::kFloat,
    at::ScalarType dtype = at::kInt,
    const vkcompute::utils::StorageType in_storage =
        vkcompute::utils::kTexture3D,
    const vkcompute::utils::StorageType out_storage =
        vkcompute::utils::kTexture3D) {
  check_quantize_args(quant_min, quant_max, dtype);
  // Token count is the product of all dimensions except the last.
  int num_tokens = 1;
  for (int i = 0; i < input_sizes.size() - 1; i++) {
    num_tokens *= input_sizes[i];
  }

  // One scale/zero_point entry is required per token.
  ASSERT_EQ(num_tokens, scales.size());
  ASSERT_EQ(num_tokens, zero_points.size());

  // Create input tensor with random values
  std::vector<int64_t> input_sizes_int64(
      input_sizes.begin(), input_sizes.end());
  at::Tensor input =
      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
  // The ATen op takes double scales and int64 zero_points; the GPU-side
  // copies are converted to float/int32 below before staging.
  at::Tensor scale_tensor =
      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
  at::Tensor zero_point_tensor =
      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));

  // Get reference output to show what we would compare against
  at::Tensor reference_out = torch::executor::native::quantize_per_token_aten(
      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);

  using namespace vkcompute;

  GraphConfig config;
  config.set_storage_type_override(in_storage);
  ComputeGraph graph(config);

  // Graph inputs: the tensor to quantize plus per-token scale/zero_point.
  // Scales are staged as float and zero_points as int32 since Vulkan doesn't
  // support double / int64_t (see the conversions before copy_into_staging).
  IOValueRef r_input = graph.add_input_tensor(
      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
  IOValueRef r_scale = graph.add_input_tensor(
      scale_tensor.sizes().vec(), vkapi::kFloat, in_storage);
  IOValueRef r_zero_point = graph.add_input_tensor(
      zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage);

  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);

  // Output tensor has the same shape as the input, in the quantized dtype.
  const ValueRef r_out = graph.add_tensor(
      input.sizes().vec(), from_at_scalartype(dtype), out_storage);

  // Record the quantize_per_token op into the graph.
  VK_GET_OP_FN("quantize_per_token.default")
  (graph,
   {
       r_input.value,
       r_scale.value,
       r_zero_point.value,
       r_quant_min,
       r_quant_max,
       r_out,
   });

  ValueRef staging_out = graph.set_output_tensor(r_out);

  // Compile the graph and record the prepack/execute command buffers. This
  // ordering (prepare -> encode_prepack -> prepack -> encode_execute) must
  // happen before any staging copies below.
  graph.prepare();
  graph.encode_prepack();
  graph.prepack();
  graph.encode_execute();

  // Copy input data to GPU
  graph.copy_into_staging(
      r_input.staging, input.const_data_ptr(), input.numel());

  // Convert scale tensor to float and copy to GPU
  at::Tensor scale_float = scale_tensor.to(at::kFloat);
  graph.copy_into_staging(
      r_scale.staging, scale_float.const_data_ptr(), scale_float.numel());

  // Convert zero_point tensor to int and copy to GPU
  at::Tensor zero_point_int = zero_point_tensor.to(at::kInt);
  graph.copy_into_staging(
      r_zero_point.staging,
      zero_point_int.const_data_ptr(),
      zero_point_int.numel());

  // Execute the graph
  graph.execute();

  // Copy output data back to CPU
  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
  graph.copy_from_staging(
      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());

  // Compare outputs. Both sides are widened to int32 so the quantized integer
  // values compare exactly regardless of the target dtype.
  at::Tensor reference_int = reference_out.to(at::kInt);
  at::Tensor vk_int = vk_out.to(at::kInt);

  const bool output_correct = at::equal(reference_int, vk_int);
  if (!output_correct) {
    // On mismatch, dump every parameter plus both tensors so the failure is
    // reproducible from the log alone.
    at::Tensor diffs = at::abs(reference_int - vk_int);

    std::cout << "\n"
              << "Failed with parameters: " << std::endl;
    std::cout << " scale(s):";
    for (size_t i = 0; i < scales.size(); i++) {
      std::cout << " " << scales[i] << " ";
    }
    std::cout << "" << std::endl;
    std::cout << " zero_point(s):";
    for (size_t i = 0; i < zero_points.size(); i++) {
      std::cout << " " << zero_points[i] << " ";
    }
    std::cout << "" << std::endl;
    std::cout << " quant_min: " << quant_min << std::endl;
    std::cout << " quant_max: " << quant_max << std::endl;
    std::cout << " storage type: "
              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
                                                          : "texture")
              << std::endl;

    std::cout << "input:" << std::endl;
    std::cout << input << std::endl;
    std::cout << "reference:" << std::endl;
    std::cout << reference_int << std::endl;
    std::cout << "vulkan:" << std::endl;
    std::cout << vk_int << std::endl;
  }

  ASSERT_TRUE(output_correct);
}

// NOTE(review): suite name says PerTensor but this exercises per_token —
// looks like a copy-paste; confirm before renaming (it would change gtest
// filter targets).
TEST(
    VulkanQuantizePerTensorTest,
    test_reference_quantize_per_token_float_to_int8) {
  // One scale/zero_point pair per token (2*3 = 6 tokens). The second scale
  // was 0 (likely a typo for 0.2 in the repeating {0.1, 0.2, 0.3} pattern),
  // which makes inv_scale infinite in the reference implementation and
  // invokes UB when converting inf/NaN to int.
  std::vector<float> scales = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3};
  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};

  test_reference_quantize_per_token(
      {2, 3, 4}, // input sizes (2*3=6 tokens)
      scales,
      zero_points,
      -128, // quant_min
      127, // quant_max
      at::kFloat,
      at::kChar);
}

// NOTE(review): suite name says PerTensor but this exercises per_token —
// looks like a copy-paste; confirm before renaming (it would change gtest
// filter targets).
TEST(
    VulkanQuantizePerTensorTest,
    test_reference_quantize_per_token_float_to_int32) {
  // One scale/zero_point pair per token (2*3 = 6 tokens). The second scale
  // was 0 (likely a typo for 0.2 in the repeating {0.1, 0.2, 0.3} pattern),
  // which makes inv_scale infinite in the reference implementation and
  // invokes UB when converting inf/NaN to int.
  std::vector<float> scales = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3};
  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};

  test_reference_quantize_per_token(
      {2, 3, 4}, // input sizes (2*3=6 tokens)
      scales,
      zero_points,
      std::numeric_limits<int32_t>::min(), // quant_min
      std::numeric_limits<int32_t>::max(), // quant_max
      at::kFloat,
      at::kInt);
}

// NOTE(review): suite name says PerTensor but this exercises per_token —
// looks like a copy-paste; confirm before renaming (it would change gtest
// filter targets).
TEST(
    VulkanQuantizePerTensorTest,
    test_reference_quantize_per_token_half_to_int32) {
  // One scale/zero_point pair per token (2*3 = 6 tokens). The second scale
  // was 0 (likely a typo for 0.2 in the repeating {0.1, 0.2, 0.3} pattern),
  // which makes inv_scale infinite in the reference implementation and
  // invokes UB when converting inf/NaN to int.
  std::vector<float> scales = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3};
  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};

  test_reference_quantize_per_token(
      {2, 3, 4}, // input sizes (2*3=6 tokens)
      scales,
      zero_points,
      std::numeric_limits<int32_t>::min(), // quant_min
      std::numeric_limits<int32_t>::max(), // quant_max
      at::kHalf,
      at::kInt);
}

// NOTE(review): suite name says PerTensor but this exercises per_token —
// looks like a copy-paste; confirm before renaming (it would change gtest
// filter targets).
TEST(
    VulkanQuantizePerTensorTest,
    test_reference_quantize_per_token_half_to_uint8) {
  // One scale/zero_point pair per token (2*3 = 6 tokens). The second scale
  // was 0 (likely a typo for 0.2 in the repeating {0.1, 0.2, 0.3} pattern),
  // which makes inv_scale infinite in the reference implementation and
  // invokes UB when converting inf/NaN to int.
  std::vector<float> scales = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3};
  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};

  test_reference_quantize_per_token(
      {2, 3, 4}, // input sizes (2*3=6 tokens)
      scales,
      zero_points,
      0, // quant_min
      255, // quant_max
      at::kHalf,
      at::kByte);
}
Loading