diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
index 6ff61dac19b..0ac08b65972 100644
--- a/backends/vulkan/test/op_tests/quantize_test.cpp
+++ b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -156,3 +156,417 @@ void check_quantize_args(
       " actual quant_max: ",
       quant_max);
 }
+/*
+ * Reference implementation of quantize_per_token.
+ *
+ * Every dimension of `input` except the last is treated as a token axis.
+ * Each token row is quantized with its own scale and zero point:
+ *   q = clamp(zero_point + nearbyint(value * (1 / scale)),
+ *             quant_min, quant_max)
+ *
+ * `scale` and `zero_point` must hold exactly one element per token.
+ * Returns a tensor shaped like `input` with element type `dtype`. Only
+ * Byte, Char, Short, Int and Long are written; any other `dtype` leaves the
+ * output uninitialized.
+ */
+at::Tensor quantize_per_token_reference_impl(
+    const at::Tensor& input,
+    const at::Tensor& scale,
+    const at::Tensor& zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype) {
+  // Create output tensor with the target dtype
+  at::Tensor out = at::empty_like(input, dtype);
+
+  // Calculate number of tokens
+  int num_tokens = 1;
+  for (int i = 0; i < input.dim() - 1; i++) {
+    num_tokens *= input.size(i);
+  }
+
+  // Verify that the number of tokens matches the size of scale and zero_point
+  // tensors
+  assert(num_tokens == scale.numel());
+  assert(num_tokens == zero_point.numel());
+
+  // Reshape input to [num_tokens, last_dim]
+  at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)});
+  at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)});
+
+  // Quantize each token separately
+  for (int token_idx = 0; token_idx < num_tokens; token_idx++) {
+    // Use float for scale since Vulkan doesn't support double
+    float token_scale = scale[token_idx].item<float>();
+    // Use int for zero_point since Vulkan doesn't support int64_t
+    int token_zero_point = zero_point[token_idx].item<int>();
+
+    // NOTE(review): a zero scale makes inv_scale infinite, and converting a
+    // non-finite product to int below is undefined behaviour. The tests in
+    // this file pass a zero scale, so confirm that is intended.
+    float inv_scale = 1.0 / token_scale;
+
+    // Quantize the token
+    for (int i = 0; i < input.size(-1); i++) {
+      float value = reshaped_input[token_idx][i].item<float>();
+      int qvalue = token_zero_point + std::nearbyint(inv_scale * value);
+
+      qvalue = std::max<int64_t>(qvalue, quant_min);
+      qvalue = std::min<int64_t>(qvalue, quant_max);
+
+      if (dtype == at::kByte) {
+        reshaped_out[token_idx][i] = static_cast<uint8_t>(qvalue);
+      } else if (dtype == at::kChar) {
+        reshaped_out[token_idx][i] = static_cast<int8_t>(qvalue);
+      } else if (dtype == at::kShort) {
+        reshaped_out[token_idx][i] = static_cast<int16_t>(qvalue);
+      } else if (dtype == at::kInt) {
+        reshaped_out[token_idx][i] = static_cast<int32_t>(qvalue);
+      } else if (dtype == at::kLong) {
+        reshaped_out[token_idx][i] = static_cast<int64_t>(qvalue);
+      }
+    }
+  }
+
+  return out;
+}
+
+// Forward declaration; default arguments are supplied by the definition below.
+void test_vulkan_quantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype,
+    at::ScalarType dtype,
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage);
+
+// Wrapper function to test both buffer and texture storage types
+void test_vulkan_quantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  // Test with buffer storage
+  test_vulkan_quantize_per_token_impl(
+      input_sizes,
+      scales,
+      zero_points,
+      quant_min,
+      quant_max,
+      in_dtype,
+      dtype,
+      vkcompute::utils::kBuffer,
+      vkcompute::utils::kBuffer);
+
+  // Test with texture storage
+  test_vulkan_quantize_per_token_impl(
+      input_sizes,
+      scales,
+      zero_points,
+      quant_min,
+      quant_max,
+      in_dtype,
+      dtype,
+      vkcompute::utils::kTexture3D,
+      vkcompute::utils::kTexture3D);
+}
+
+/*
+ * Checks quantize_per_token_aten against quantize_per_token_reference_impl
+ * on a deterministic ramp of evenly spaced values starting at 0.
+ *
+ * `scales` and `zero_points` must contain one entry per token, i.e. the
+ * product of all entries of `input_sizes` except the last.
+ */
+void test_reference_quantize_per_token(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+
+  // Fill with a simple pattern: values from 0 to 1 in steps. A single-element
+  // input has no interval to divide, so guard against dividing by zero.
+  const float step = input.numel() > 1 ? 1.0 / (input.numel() - 1) : 0.0;
+  auto flat_input = input.flatten();
+  for (int i = 0; i < flat_input.numel(); i++) {
+    flat_input[i] = i * step;
+  }
+
+  // Reshape back to original dimensions
+  input = flat_input.reshape(input_sizes_int64);
+
+  // Calculate number of tokens
+  int num_tokens = 1;
+  for (int i = 0; i < input.dim() - 1; i++) {
+    num_tokens *= input.size(i);
+  }
+
+  // Verify that the number of tokens matches the size of scales and zero_points
+  ASSERT_EQ(num_tokens, scales.size());
+  ASSERT_EQ(num_tokens, zero_points.size());
+
+  // Create scale and zero_point tensors
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output
+  at::Tensor reference_out = quantize_per_token_reference_impl(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  // Get implementation output
+  at::Tensor impl_out = torch::executor::native::quantize_per_token_aten(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  // Convert to int for consistent display regardless of underlying type
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor impl_int = impl_out.to(at::kInt);
+
+  // Compare the two int-converted tensors so both operands share a dtype.
+  const bool output_correct = at::equal(reference_int, impl_int);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << " scale(s):";
+    for (size_t i = 0; i < scales.size(); i++) {
+      std::cout << " " << scales[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << " zero_point(s):";
+    for (size_t i = 0; i < zero_points.size(); i++) {
+      std::cout << " " << zero_points[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << " quant_min: " << quant_min << std::endl;
+    std::cout << " quant_max: " << quant_max << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_int << std::endl;
+    std::cout << "my_reference:" << std::endl;
+    std::cout << impl_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+/*
+ * Runs quantize_per_token.default through a Vulkan ComputeGraph on random
+ * input and checks the result against quantize_per_token_aten.
+ *
+ * `scales` and `zero_points` must contain one entry per token, i.e. the
+ * product of all entries of `input_sizes` except the last.
+ */
+void test_vulkan_quantize_per_token_impl(
+    const std::vector<int>& input_sizes,
+    const std::vector<float>& scales,
+    const std::vector<int>& zero_points,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt,
+    const vkcompute::utils::StorageType in_storage =
+        vkcompute::utils::kTexture3D,
+    const vkcompute::utils::StorageType out_storage =
+        vkcompute::utils::kTexture3D) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  int num_tokens = 1;
+  // `i + 1 < size()` avoids the unsigned wrap-around of `size() - 1` when
+  // input_sizes is empty.
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  ASSERT_EQ(num_tokens, scales.size());
+  ASSERT_EQ(num_tokens, zero_points.size());
+
+  // Create input tensor with random values
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+  at::Tensor scale_tensor =
+      at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
+  at::Tensor zero_point_tensor =
+      at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong));
+
+  // Get reference output to show what we would compare against
+  at::Tensor reference_out = torch::executor::native::quantize_per_token_aten(
+      input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype);
+
+  using namespace vkcompute;
+
+  GraphConfig config;
+  config.set_storage_type_override(in_storage);
+  ComputeGraph graph(config);
+
+  IOValueRef r_input = graph.add_input_tensor(
+      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
+  IOValueRef r_scale = graph.add_input_tensor(
+      scale_tensor.sizes().vec(), vkapi::kFloat, in_storage);
+  IOValueRef r_zero_point = graph.add_input_tensor(
+      zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage);
+
+  const ValueRef r_quant_min = graph.add_scalar(quant_min);
+  const ValueRef r_quant_max = graph.add_scalar(quant_max);
+
+  const ValueRef r_out = graph.add_tensor(
+      input.sizes().vec(), from_at_scalartype(dtype), out_storage);
+
+  VK_GET_OP_FN("quantize_per_token.default")
+  (graph,
+   {
+       r_input.value,
+       r_scale.value,
+       r_zero_point.value,
+       r_quant_min,
+       r_quant_max,
+       r_out,
+   });
+
+  ValueRef staging_out = graph.set_output_tensor(r_out);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  // Copy input data to GPU
+  graph.copy_into_staging(
+      r_input.staging, input.const_data_ptr(), input.numel());
+
+  // Convert scale tensor to float and copy to GPU
+  at::Tensor scale_float = scale_tensor.to(at::kFloat);
+  graph.copy_into_staging(
+      r_scale.staging, scale_float.const_data_ptr(), scale_float.numel());
+
+  // Convert zero_point tensor to int and copy to GPU
+  at::Tensor zero_point_int = zero_point_tensor.to(at::kInt);
+  graph.copy_into_staging(
+      r_zero_point.staging,
+      zero_point_int.const_data_ptr(),
+      zero_point_int.numel());
+
+  // Execute the graph
+  graph.execute();
+
+  // Copy output data back to CPU
+  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
+  graph.copy_from_staging(
+      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());
+
+  // Compare outputs
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor vk_int = vk_out.to(at::kInt);
+
+  const bool output_correct = at::equal(reference_int, vk_int);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << " scale(s):";
+    for (size_t i = 0; i < scales.size(); i++) {
+      std::cout << " " << scales[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << " zero_point(s):";
+    for (size_t i = 0; i < zero_points.size(); i++) {
+      std::cout << " " << zero_points[i] << " ";
+    }
+    std::cout << "" << std::endl;
+    std::cout << " quant_min: " << quant_min << std::endl;
+    std::cout << " quant_max: " << quant_max << std::endl;
+    std::cout << " storage type: "
+              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
+                                                          : "texture")
+              << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_int << std::endl;
+    std::cout << "vulkan:" << std::endl;
+    std::cout << vk_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_float_to_int8) {
+  std::vector<float> scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3};
+  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};
+
+  test_reference_quantize_per_token(
+      {2, 3, 4}, // input sizes (2*3=6 tokens)
+      scales,
+      zero_points,
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat,
+      at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_float_to_int32) {
+  std::vector<float> scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3};
+  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};
+
+  test_reference_quantize_per_token(
+      {2, 3, 4}, // input sizes (2*3=6 tokens)
+      scales,
+      zero_points,
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kFloat,
+      at::kInt);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_half_to_int32) {
+  std::vector<float> scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3};
+  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};
+
+  test_reference_quantize_per_token(
+      {2, 3, 4}, // input sizes (2*3=6 tokens)
+      scales,
+      zero_points,
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kHalf,
+      at::kInt);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_token_half_to_uint8) {
+  std::vector<float> scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3};
+  std::vector<int> zero_points = {1, 2, 3, 0, -1, -2};
+
+  test_reference_quantize_per_token(
+      {2, 3, 4}, // input sizes (2*3=6 tokens)
+      scales,
+      zero_points,
+      0, // quant_min
+      255, // quant_max
+      at::kHalf,
+      at::kByte);
+}