Commit b005f10

[ET-VK] Re-implement (de)quantize_per_tensor.default (#15753)

Re-implement the `quantized_decomposed.(de)quantize_per_tensor.default` ops with `add_quantize_and_pack_4w4c_node`. As a consequence, the `et_vk.quantize_q8ta_for_conv2d.default` and `et_vk.dequantize_q8to_from_conv2d.default` ops are no longer needed. The overall goal is to streamline the quantize/dequantize interface in ET-VK.

Differential Revision: [D86702457](https://our.internmc.facebook.com/intern/diff/D86702457/)

1 parent 1ca3252 commit b005f10
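
For reference, `quantized_decomposed.quantize_per_tensor` and its `dequantize` counterpart implement plain affine quantization, which is the behavior the Vulkan shaders must reproduce. A minimal sketch of the semantics (the `_ref` helper names are illustrative, not part of the codebase):

import torch

def quantize_per_tensor_ref(x, scale, zero_point, quant_min=-128, quant_max=127):
    # q = clamp(round(x / scale) + zero_point, quant_min, quant_max)
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, quant_min, quant_max).to(torch.int8)

def dequantize_per_tensor_ref(q, scale, zero_point):
    # x_hat = (q - zero_point) * scale
    return (q.to(torch.float32) - zero_point) * scale

# With scale=0.05, zero_point=10: 1.0 -> round(1.0 / 0.05) + 10 = 30,
# and (30 - 10) * 0.05 = 1.0 recovers the input exactly.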

File tree: 7 files changed, +48, -175 lines

backends/vulkan/_passes/TARGETS

Lines changed: 0 additions & 14 deletions

@@ -104,19 +104,6 @@ runtime.python_library(
     ],
 )
 
-runtime.python_library(
-    name = "replace_qdq",
-    srcs = ["replace_qdq.py"],
-    visibility = [
-        "//executorch/backends/...",
-    ],
-    deps = [
-        "//caffe2:torch",
-        "//executorch/backends/vulkan:utils_lib",
-        "//executorch/exir:pass_base",
-    ],
-)
-
 runtime.python_library(
     name = "fuse_patterns",
     srcs = ["fuse_patterns.py"],
@@ -149,7 +136,6 @@ runtime.python_library(
         ":insert_prepack_nodes",
         ":remove_asserts",
         ":remove_redundant_ops",
-        ":replace_qdq",
         ":squeeze_unsqueeze_inputs",
         ":tag_memory_meta_pass",
     ]

backends/vulkan/_passes/__init__.py

Lines changed: 0 additions & 2 deletions

@@ -19,7 +19,6 @@
 from executorch.backends.vulkan._passes.remove_redundant_ops import (
     RemoveRedundantOpsTransform,
 )
-from executorch.backends.vulkan._passes.replace_qdq import ReplaceQDQPass
 from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import (
     SqueezeUnsqueezeInputs,
 )
@@ -33,7 +32,6 @@
     "remove_asserts",
     "RemoveAssertsTransform",
     "RemoveRedundantOpsTransform",
-    "ReplaceQDQPass",
     "SqueezeUnsqueezeInputs",
     "TagMemoryMetaPass",
 ]

backends/vulkan/_passes/replace_qdq.py

Lines changed: 0 additions & 93 deletions
This file was deleted.

backends/vulkan/custom_ops_lib.py

Lines changed: 0 additions & 36 deletions

@@ -539,42 +539,6 @@ def apply_rotary_emb_impl(
 lib.impl(name, apply_rotary_emb_impl, "CompositeExplicitAutograd")
 apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)
 
-#############################
-## quantize/dequantize ops ##
-#############################
-
-
-def quantize_q8ta_for_conv2d_impl(
-    input: torch.Tensor,
-    scale: float,
-    zero_point: int,
-):
-    return torch.ops.quantized_decomposed.quantize_per_tensor(
-        input, scale, zero_point, -128, 127, torch.int8
-    )
-
-
-name = "quantize_q8ta_for_conv2d"
-lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor")
-lib.impl(name, quantize_q8ta_for_conv2d_impl, "CompositeExplicitAutograd")
-quantize_q8ta_for_conv2d_op = getattr(getattr(torch.ops, namespace), name)
-
-
-def dequantize_q8to_from_conv2d_impl(
-    input: torch.Tensor,
-    scale: float,
-    zero_point: int,
-):
-    return torch.ops.quantized_decomposed.dequantize_per_tensor(
-        input, scale, zero_point, -128, 127, input.dtype
-    )
-
-
-name = "dequantize_q8to_from_conv2d"
-lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor")
-lib.impl(name, dequantize_q8to_from_conv2d_impl, "CompositeExplicitAutograd")
-dequantize_q8to_from_conv2d_op = getattr(getattr(torch.ops, namespace), name)
-
 ########################
 ## add_q8ta_q8ta_q8to ##
 ########################
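
The deleted `et_vk` ops above were thin wrappers that fixed the int8 range at [-128, 127]; after this change, export flows call the decomposed op directly. A hedged sketch of the equivalent call (assuming the `quantized_decomposed` op library has been registered, e.g. via the `torch.ao.quantization.fx._decomposed` import in recent PyTorch releases):

import torch
import torch.ao.quantization.fx._decomposed  # noqa: F401 -- registers quantized_decomposed ops

x = torch.randn(1, 4, 8, 8)

# Previously: torch.ops.et_vk.quantize_q8ta_for_conv2d(x, 0.05, 10)
# Now, with the wrapper's fixed qmin/qmax/dtype spelled out:
q = torch.ops.quantized_decomposed.quantize_per_tensor(
    x, 0.05, 10, -128, 127, torch.int8
)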

backends/vulkan/op_registry.py

Lines changed: 7 additions & 11 deletions

@@ -144,13 +144,9 @@ def register_ephemeral_op():
 
 @update_features(
     [
-        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
         exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
-        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
-        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         exir_ops.edge.quantized_decomposed.quantize_per_token.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         exir_ops.edge.quantized_decomposed.dequantize_per_token.default,
     ]
 )
@@ -630,35 +626,35 @@ def register_quantized_binary_op():
 
 @update_features(
     [
-        exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
     ]
 )
-def register_quantize_for_conv2d_op():
+def register_quantize_op():
     return OpFeatures(
         inputs_storage=[
             utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
         outputs_storage=[
             utils.PACKED_INT8_4W4C_BUFFER,
         ],
-        supports_resize=False,
     )
 
 
 @update_features(
     [
-        exir_ops.edge.et_vk.dequantize_q8to_from_conv2d.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
     ]
 )
-def register_dequantize_for_conv2d_op():
+def register_dequantize_op():
     return OpFeatures(
         inputs_storage=[
             utils.PACKED_INT8_4W4C_BUFFER,
         ],
         outputs_storage=[
             utils.CHANNELS_PACKED_TEXTURE_OR_CONTIGUOUS_BUFFER,
         ],
-        supports_resize=False,
     )
 

backends/vulkan/runtime/graph/ops/impl/QuantizeDequantize.cpp

Lines changed: 41 additions & 17 deletions

@@ -366,30 +366,52 @@ void add_unpack_4w4c_and_dequantize_node(
 // Operator Entrypoints
 //
 
-void quantize_q8ta_for_conv2d(
+void quantize_per_tensor_impl(
     ComputeGraph& graph,
     const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef fp_input = args.at(idx++);
-  const ValueRef scale = args.at(idx++);
-  const ValueRef zero_point = args.at(idx++);
-  const ValueRef packed_int8_input = args.at(idx++);
+  int32_t arg_idx = 0;
+  const ValueRef fp_input = args[arg_idx++];
+  const ValueRef scale = args[arg_idx++];
+  const ValueRef zero_point = args[arg_idx++];
+  const ValueRef quant_min = args[arg_idx++];
+  (void)quant_min;
+  const ValueRef quant_max = args[arg_idx++];
+  (void)quant_max;
+  const ValueRef dtype = args[arg_idx++];
+  (void)dtype;
+
+  const ValueRef int8_output = args[arg_idx++];
+
+  VK_CHECK_COND(
+      graph.estimate_memory_layout_of(int8_output) == utils::kPackedInt8_4W4C);
 
   add_quantize_and_pack_4w4c_node(
-      graph, fp_input, scale, zero_point, packed_int8_input);
+      graph, fp_input, scale, zero_point, int8_output);
 }
 
-void dequantize_q8to_from_conv2d(
+void dequantize_per_tensor_impl(
     ComputeGraph& graph,
     const std::vector<ValueRef>& args) {
-  int32_t idx = 0;
-  const ValueRef packed_int8_output = args.at(idx++);
-  const ValueRef scale = args.at(idx++);
-  const ValueRef zero_point = args.at(idx++);
-  const ValueRef fp_output = args.at(idx++);
+  int32_t arg_idx = 0;
+  const ValueRef int8_input = args[arg_idx++];
+  const ValueRef scale = args[arg_idx++];
+  const ValueRef zero_point = args[arg_idx++];
+  const ValueRef quant_min = args[arg_idx++];
+  (void)quant_min;
+  const ValueRef quant_max = args[arg_idx++];
+  (void)quant_max;
+  const ValueRef dtype = args[arg_idx++];
+  (void)dtype;
+  const ValueRef output_dtype = args[arg_idx++];
+  (void)output_dtype;
+
+  const ValueRef fp_output = args[arg_idx++];
+
+  VK_CHECK_COND(
+      graph.estimate_memory_layout_of(int8_input) == utils::kPackedInt8_4W4C);
 
   add_unpack_4w4c_and_dequantize_node(
-      graph, packed_int8_output, scale, zero_point, fp_output);
+      graph, int8_input, scale, zero_point, fp_output);
 }
 
 void qdq8ta_conv2d_input(
@@ -416,11 +438,13 @@ void qdq8ta_conv2d_input(
 }
 
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input);
   VK_REGISTER_OP(
-      et_vk.quantize_q8ta_for_conv2d.default, quantize_q8ta_for_conv2d);
+      quantized_decomposed.quantize_per_tensor.default,
+      quantize_per_tensor_impl);
   VK_REGISTER_OP(
-      et_vk.dequantize_q8to_from_conv2d.default, dequantize_q8to_from_conv2d);
+      quantized_decomposed.dequantize_per_tensor.default,
+      dequantize_per_tensor_impl);
+  VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input);
 }
 
 } // namespace vkcompute
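
The new entrypoints read arguments in the order of the `quantized_decomposed` schemas — (input, scale, zero_point, quant_min, quant_max, dtype), plus a trailing output dtype for dequantize — and discard the range/dtype values via the `(void)` casts, since the packed 4w4c path binds them in the shader. A round-trip sketch at the eager PyTorch level, under the same assumption that the decomposed op library has been imported:

import torch
import torch.ao.quantization.fx._decomposed  # noqa: F401

x = torch.randn(2, 3).clamp(-12.0, 12.0)  # keep values inside the int8 range for scale=0.1
scale, zero_point = 0.1, 0

q = torch.ops.quantized_decomposed.quantize_per_tensor(
    x, scale, zero_point, -128, 127, torch.int8
)
x_hat = torch.ops.quantized_decomposed.dequantize_per_tensor(
    q, scale, zero_point, -128, 127, torch.int8
)
assert torch.allclose(x, x_hat, atol=scale)  # rounding error is at most scale / 2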

backends/vulkan/vulkan_preprocess.py

Lines changed: 0 additions & 2 deletions

@@ -21,7 +21,6 @@
     FuseQuantizedOpsTransform,
     insert_prepack_nodes,
     RemoveRedundantOpsTransform,
-    ReplaceQDQPass,
     SqueezeUnsqueezeInputs,
     TagMemoryMetaPass,
 )
@@ -162,7 +161,6 @@ def preprocess(  # noqa: C901
             AddmmToLinearTransform(),
             RemoveRedundantOpsTransform(),
             FuseQuantizedOpsTransform(),
-            ReplaceQDQPass(),
             FoldQDQPass(),
             SqueezeUnsqueezeInputs(),
             FuseViewCopyTransform(),
