From a7acf134043ce4543e0f68adcf749f3af33a1977 Mon Sep 17 00:00:00 2001
From: morelos <morelos@devvm4573.ash0.facebook.com>
Date: Fri, 13 Jun 2025 15:49:28 -0700
Subject: [PATCH] [ET-VK][Ops] quantize_per_tensor.default test setup

Pull Request resolved: https://github.com/pytorch/executorch/pull/11368

# Context
To better understand these operators, I needed to create a reference implementation and also build out the Vulkan testing framework, which provides the setup required to call the Vulkan implementations. I won't explain what the quantize operator actually is in this diff; instead, I will explain the operator in a future diff where I implement the GLSL shader. However, the reference implementation is heavily inspired by the CPU implementation and aims to perform similar checks when calculating the tokens and performing the quantization with the given scales and zero points.

This diff is the per_tensor reference implementation.

# Changes
The main changes are the inclusion of the reference implementation, which is used for my own learning, and the necessary wrapper functions that will be called later once the Vulkan implementation is complete. It has everything necessary for this purpose, including calling the operator by its appropriate name as defined in the C++ implementation header, and staging components correctly from the GPU to the CPU, which is where the comparison is done. I have also included comprehensive failure print statements that print the tensor size along with relevant parameters such as the zero points or scales passed in, and even the min and max for quantization.

This is for the per_tensor implementation.
ghstack-source-id: 290376496
@exported-using-ghexport

Differential Revision: [D75959065](https://our.internmc.facebook.com/intern/diff/D75959065/)
---
 .../vulkan/test/op_tests/quantize_test.cpp    | 301 ++++++++++++++++++
 1 file changed, 301 insertions(+)
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
index 0ac08b65972..8b79dc1ce6b 100644
--- a/backends/vulkan/test/op_tests/quantize_test.cpp
+++ b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -156,6 +156,56 @@ void check_quantize_args(
       " actual quant_max: ",
       quant_max);
 }
+
+//
+// Reference Implementation
+//
+
+/*
+ * Reference implementation of quantize_per_tensor: every element becomes
+ * clamp(zero_point + nearbyint(value * (1 / scale)), quant_min, quant_max).
+ */
+at::Tensor quantize_per_tensor_reference_impl(
+    const at::Tensor& input,
+    double scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType dtype) {
+  // Contiguous output so that flatten() below is a view and the per-element
+  // writes land in `out` even when `input` has a non-contiguous layout
+  at::Tensor out = at::empty_like(input, dtype, at::MemoryFormat::Contiguous);
+  auto out_flat = out.flatten();
+
+  // Input values converted to float and flattened for element-wise access
+  at::Tensor float_values = input.to(at::kFloat).flatten();
+  const int64_t numel = float_values.numel();
+
+  float inv_scale = 1.0 / scale;
+  for (int64_t i = 0; i < numel; i++) {
+    float value = float_values[i].item<float>();
+    int64_t qvalue = zero_point + std::nearbyint(inv_scale * value);
+
+    qvalue = std::max<int64_t>(qvalue, quant_min);
+    qvalue = std::min<int64_t>(qvalue, quant_max);
+
+    // Store via the matching C++ type; other dtypes leave the slot unwritten
+    if (dtype == at::kByte) {
+      out_flat[i] = static_cast<uint8_t>(qvalue);
+    } else if (dtype == at::kChar) {
+      out_flat[i] = static_cast<int8_t>(qvalue);
+    } else if (dtype == at::kShort) {
+      out_flat[i] = static_cast<int16_t>(qvalue);
+    } else if (dtype == at::kInt) {
+      out_flat[i] = static_cast<int32_t>(qvalue);
+    } else if (dtype == at::kLong) {
+      out_flat[i] = static_cast<int64_t>(qvalue);
+    }
+  }
+
+  return out.reshape(input.sizes());
+}
+
 /*
  * Reference implementation of quantize_per_token
  */
@@ -218,6 +268,18 @@ at::Tensor quantize_per_token_reference_impl(
   return out;
 }
 
+// Forward declarations so the wrappers below can call the *_impl functions
+void test_vulkan_quantize_per_tensor_impl(
+    const std::vector<int>& input_sizes,
+    float scale,
+    int zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype,
+    at::ScalarType dtype,
+    const vkcompute::utils::StorageType in_storage,
+    const vkcompute::utils::StorageType out_storage);
+
 void test_vulkan_quantize_per_token_impl(
     const std::vector<int>& input_sizes,
     const std::vector<float>& scales,
@@ -229,6 +291,40 @@ void test_vulkan_quantize_per_token_impl(
     const vkcompute::utils::StorageType in_storage,
     const vkcompute::utils::StorageType out_storage);
 
+// Wrapper that runs the Vulkan quantize_per_tensor test once per storage
+// type: first with buffer storage, then with 3D texture storage. The same
+// storage type is used for both the input and the output tensor, and every
+// other argument is forwarded unchanged to
+// test_vulkan_quantize_per_tensor_impl.
+void test_vulkan_quantize_per_tensor(
+    const std::vector<int>& input_sizes,
+    float scale,
+    int zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  const vkcompute::utils::StorageType storage_types[] = {
+      vkcompute::utils::kBuffer,
+      vkcompute::utils::kTexture3D,
+  };
+
+  for (const vkcompute::utils::StorageType storage : storage_types) {
+    // ASSERT_* in the impl only returns from that call, so a failure for one
+    // storage type does not prevent the other from being exercised
+    test_vulkan_quantize_per_tensor_impl(
+        input_sizes,
+        scale,
+        zero_point,
+        quant_min,
+        quant_max,
+        in_dtype,
+        dtype,
+        /*in_storage=*/storage,
+        /*out_storage=*/storage);
+  }
+}
+
 // Wrapper function to test both buffer and texture storage types
 void test_vulkan_quantize_per_token(
     const std::vector<int>& input_sizes,
@@ -263,6 +359,211 @@ void test_vulkan_quantize_per_token(
       vkcompute::utils::kTexture3D);
 }
 
+// Checks quantize_per_tensor_reference_impl against quantize_per_tensor_aten.
+void test_reference_quantize_per_tensor(
+    const std::vector<int>& input_sizes,
+    float scale,
+    int zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+
+  // Ramp from 0 to 1; a single element stays 0 rather than dividing by zero
+  const int64_t numel = input.numel();
+  const float step = numel > 1 ? 1.0f / (numel - 1) : 0.0f;
+  auto flat_input = input.flatten();
+  for (int64_t i = 0; i < numel; i++) {
+    flat_input[i] = i * step;
+  }
+
+  // Reshape back to original dimensions
+  input = flat_input.reshape(input_sizes_int64);
+
+  // Output of the reference implementation under test
+  at::Tensor reference_out = quantize_per_tensor_reference_impl(
+      input, scale, zero_point, quant_min, quant_max, dtype);
+
+  // Output of the implementation the reference is compared against
+  at::Tensor impl_out = torch::executor::native::quantize_per_tensor_aten(
+      input, scale, zero_point, quant_min, quant_max, dtype);
+
+  // Convert to int for consistent display regardless of underlying type
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor impl_int = impl_out.to(at::kInt);
+
+  const bool output_correct = at::equal(reference_int, impl_int);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << "  scale: " << scale << std::endl;
+    std::cout << "  zero_point: " << zero_point << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_int << std::endl;
+    std::cout << "implementation:" << std::endl;
+    std::cout << impl_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+// Runs quantize_per_tensor.default through a Vulkan ComputeGraph on random
+// input and checks the result against quantize_per_tensor_aten on the CPU.
+void test_vulkan_quantize_per_tensor_impl(
+    const std::vector<int>& input_sizes,
+    float scale,
+    int zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    at::ScalarType in_dtype = at::kFloat,
+    at::ScalarType dtype = at::kInt,
+    const vkcompute::utils::StorageType in_storage =
+        vkcompute::utils::kTexture3D,
+    const vkcompute::utils::StorageType out_storage =
+        vkcompute::utils::kTexture3D) {
+  check_quantize_args(quant_min, quant_max, dtype);
+  std::vector<int64_t> input_sizes_int64(
+      input_sizes.begin(), input_sizes.end());
+  at::Tensor input =
+      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
+
+  // Get reference output
+  at::Tensor reference_out = torch::executor::native::quantize_per_tensor_aten(
+      input, scale, zero_point, quant_min, quant_max, dtype);
+
+  // Build Vulkan quantize_per_tensor graph
+  using namespace vkcompute;
+
+  GraphConfig config;
+  config.set_storage_type_override(in_storage);
+  ComputeGraph graph(config);
+
+  IOValueRef r_input = graph.add_input_tensor(
+      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
+
+  const ValueRef r_scale = graph.add_scalar<double>(scale);
+  const ValueRef r_zero_point = graph.add_scalar<int64_t>(zero_point);
+  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
+  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
+
+  const ValueRef r_out = graph.add_tensor(
+      input.sizes().vec(), from_at_scalartype(dtype), out_storage);
+
+  VK_GET_OP_FN("quantize_per_tensor.default")
+  (graph,
+   {
+       r_input.value,
+       r_scale,
+       r_zero_point,
+       r_quant_min,
+       r_quant_max,
+       r_out,
+   });
+
+  ValueRef staging_out = graph.set_output_tensor(r_out);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+
+  // Run Vulkan quantize_per_tensor
+  graph.copy_into_staging(
+      r_input.staging, input.const_data_ptr(), input.numel());
+
+  graph.execute();
+
+  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
+  graph.copy_from_staging(
+      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());
+
+  // Compare outputs
+  // For quantized types, we need to compare the actual integer values
+  at::Tensor reference_int = reference_out.to(at::kInt);
+  at::Tensor vk_int = vk_out.to(at::kInt);
+
+  const bool output_correct = at::equal(reference_int, vk_int);
+  if (!output_correct) {
+    std::cout << "\n"
+              << "Failed with parameters: " << std::endl;
+    std::cout << "  scale: " << scale << std::endl;
+    std::cout << "  zero_point: " << zero_point << std::endl;
+    std::cout << "  quant_min: " << quant_min << std::endl;
+    std::cout << "  quant_max: " << quant_max << std::endl;
+
+    std::cout << "input:" << std::endl;
+    std::cout << input << std::endl;
+    std::cout << "reference:" << std::endl;
+    std::cout << reference_int << std::endl;
+    std::cout << "vulkan:" << std::endl;
+    std::cout << vk_int << std::endl;
+  }
+
+  ASSERT_TRUE(output_correct);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_tensor_float_to_int8) {
+  // float input quantized to int8 with bounds [-128, 127]
+  const std::vector<int> sizes = {2, 3, 4};
+  const float scale = 0.1;
+  const int zero_point = 0;
+  const int64_t quant_min = -128;
+  const int64_t quant_max = 127;
+  test_reference_quantize_per_tensor(
+      sizes, scale, zero_point, quant_min, quant_max, at::kFloat, at::kChar);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_tensor_float_to_int32) {
+  // float input quantized to int32 with bounds spanning all of int32_t
+  const std::vector<int> sizes = {2, 3, 4};
+  const float scale = 0.04;
+  const int zero_point = 5;
+  const int64_t quant_min = std::numeric_limits<int32_t>::min();
+  const int64_t quant_max = std::numeric_limits<int32_t>::max();
+  test_reference_quantize_per_tensor(
+      sizes, scale, zero_point, quant_min, quant_max, at::kFloat, at::kInt);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_tensor_half_to_uint8) {
+  // half input quantized to uint8 with bounds [0, 255]
+  const std::vector<int> sizes = {2, 3, 4};
+  const float scale = 0.2;
+  const int zero_point = 2;
+  const int64_t quant_min = 0;
+  const int64_t quant_max = 255;
+  test_reference_quantize_per_tensor(
+      sizes, scale, zero_point, quant_min, quant_max, at::kHalf, at::kByte);
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_reference_quantize_per_tensor_half_to_int32) {
+  // half input quantized to int32 with bounds spanning all of int32_t
+  const std::vector<int> sizes = {2, 3, 4};
+  const float scale = 0.01;
+  const int zero_point = 1;
+  const int64_t quant_min = std::numeric_limits<int32_t>::min();
+  const int64_t quant_max = std::numeric_limits<int32_t>::max();
+  test_reference_quantize_per_tensor(
+      sizes, scale, zero_point, quant_min, quant_max, at::kHalf, at::kInt);
+}
+
 void test_reference_quantize_per_token(
     const std::vector<int>& input_sizes,
     const std::vector<float>& scales,