[ET-VK] Adding all tensor packing support for repeat op. (#9813)

pytorchbot · kirklandsign · commit dda8f4e0c246 · 2025-04-11T14:32:59.000-07:00
This diff updates the ExecuTorch Vulkan backend's `repeat` operation to support width-, height-, and channel-packed tensors. It also updates the `op_registry.py` file to indicate that the `repeat` operation supports all packing layouts. Differential Revision: [D71477633](https://our.internmc.facebook.com/intern/diff/D71477633/)
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -527,8 +527,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
-        # Tensor combination
-        exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
         exir_ops.edge.aten.clone.default,
@@ -561,6 +559,7 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.repeat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
         exir_ops.edge.aten.split.Tensor,
     ]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
@@ -45,7 +45,7 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
 ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
 
-${layout_declare_spec_const(C, "bool", "repeat", "false")}
+${layout_declare_spec_const(C, "int", "repeat", "0")}
 
 void no_repeat_copy(ivec3 pos) {
   // Position in input tensor
@@ -229,7 +229,7 @@ void main() {
     return;
   }
 
-  if (repeat) {
+  if (repeat == 1) {
     repeat_copy(pos);
   } else {
     no_repeat_copy(pos);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -151,7 +151,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
       nullptr,
       {},
       {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp
@@ -23,8 +23,7 @@ void check_args(
     const api::vTensor& in,
     const std::vector<int64_t>& repeats,
     const api::vTensor& out) {
-  VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_same_packed_dim(in, out));
 
   VK_CHECK_COND(in.storage_type() == out.storage_type());
   if (in.storage_type() == utils::kTexture2D) {
@@ -59,147 +58,29 @@ void check_args(
 
 } // namespace
 
-void add_repeat_channel_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    int64_t repeat_channel,
-    ValueRef out,
-    utils::ivec3& running_range) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  std::string kernel_name = "repeat_channel";
-  kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  int32_t in_width = utils::safe_downcast<int32_t>(dim_at<kWidth4D>(in_sizes));
-  int32_t in_height =
-      utils::safe_downcast<int32_t>(dim_at<kHeight4D>(in_sizes));
-  int32_t in_channel =
-      utils::safe_downcast<int32_t>(dim_at<kChannel4D>(in_sizes));
-  int32_t in_batch = utils::safe_downcast<int32_t>(dim_at<kBatch4D>(in_sizes));
-
-  int32_t out_channel = repeat_channel * in_channel;
-
-  utils::ivec4 out_whcn_sizes{in_width, in_height, out_channel, in_batch};
-
-  utils::ivec4 in_whcn_sizes{in_width, in_height, in_channel, in_batch};
-
-  // Channel packed global work ids
-  running_range[2] = out_whcn_sizes[3] * utils::div_up_4(out_whcn_sizes[2]);
-  utils::uvec3 global_size = utils::make_uvec3(running_range);
-  utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  const struct Block final {
-    utils::ivec4 out_sizes;
-    utils::ivec4 in_size;
-  } repeat_channel_args{
-      out_whcn_sizes,
-      in_whcn_sizes,
-  };
-
-  auto shader = VK_KERNEL_FROM_STR(kernel_name);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
-      graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      // Parameter buffers
-      {graph.create_params_buffer(repeat_channel_args)},
-      // Specialization Constants
-      {SV(t_out->packed_dim())}));
-}
-
 void add_repeat_node(
     ComputeGraph& graph,
     ValueRef in,
     ValueRef repeats_ref,
     ValueRef out) {
-  std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
+  const std::vector<int64_t> repeats = *(graph.get_int_list(repeats_ref));
 
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
   check_args(*t_in, repeats, *t_out);
 
-  // In this function, we expand the dimensions in the following order:
-  // 1. Channel
-  // 2. Width
-  // 3. Height
-  // 4. Batch
-  // After expanding a dimension, we will update the "running_range" since we
-  // will need to copy the "expanded" area.
-
-  utils::ivec3 running_range = t_in->logical_limits();
-
-  const std::vector<int64_t>& in_sizes = t_in->sizes();
-
-  // Since we use channel packing, repeating the channel dimension is the most
-  // complicated and time-consuming, as we need to reason over misaligned
-  // channels. Hence we expand it first to minimize cost. Also, in this first
-  // dimension, we copy over the input texure to the output. In subsequent
-  // dimensions, we read and write from the same tensor.
-
-  if (int64_t channel_repeat = dim_at<kChannel4D>(repeats);
-      channel_repeat == 1) {
-    // If no repeat, short-cut to a direct copy
-    utils::ivec4 src_offset{0, 0, 0, 0};
-    utils::ivec4 dst_offset{0, 0, 0, 0};
-
-    add_copy_offset_node(
-        graph, in, running_range, src_offset, dst_offset, out, false, false);
-
-  } else {
-    add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
-  }
-
-  // TODO: refactor width, height, and batch into a common helper function.
-  // Width
-  if (int64_t width_repeat = dim_at<kWidth4D>(repeats); width_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < width_repeat; ++i) {
-      utils::ivec4 dst_offset{i * dim_at<kWidth4D>(in_sizes), 0, 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[0] = running_range[0] * width_repeat;
-  }
-
-  // Height
-  if (int64_t height_repeat = dim_at<kHeight4D>(repeats); height_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < height_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, i * dim_at<kHeight4D>(in_sizes), 0, 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[1] = running_range[1] * height_repeat;
-  }
-
-  // Batch
-  if (int64_t batch_repeat = dim_at<kBatch4D>(repeats); batch_repeat > 1) {
-    utils::ivec4 src_offset{0, 0, 0, 0};
-
-    for (int i = 1; i < batch_repeat; ++i) {
-      utils::ivec4 dst_offset = {0, 0, i * running_range[2], 0};
-
-      add_copy_offset_node(
-          graph, out, running_range, src_offset, dst_offset, out, true, false);
-    }
-
-    running_range[2] = running_range[2] * batch_repeat;
-  }
+  const utils::ivec4 src_offset{
+      dim_at<kWidth4D>(t_in->sizes()),
+      dim_at<kHeight4D>(t_in->sizes()),
+      dim_at<kChannel4D>(t_in->sizes()),
+      dim_at<kBatch4D>(t_in->sizes())};
+  const utils::ivec4 dst_offset{
+      dim_at<kWidth4D>(repeats),
+      dim_at<kHeight4D>(repeats),
+      dim_at<kChannel4D>(repeats),
+      dim_at<kBatch4D>(repeats)};
+  add_copy_packed_dim_offset_node(
+      graph, in, t_out->logical_limits(), src_offset, dst_offset, out, true);
 }
 
 void repeat(ComputeGraph& graph, const std::vector<ValueRef>& args) {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -756,7 +756,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 1, 4]),
         ]
     )
-    test_suite_2d.layouts = ["utils::kChannelsPacked"]
+    test_suite_2d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_2d.storage_types = ["utils::kTexture2D"]
     test_suite_2d.data_gen = "make_seq_tensor"
     test_suite_2d.dtypes = ["at::kFloat"]
@@ -797,7 +801,11 @@ def get_repeat_inputs():
             ((2, 3), [3, 3, 2, 4]),
         ]
     )
-    test_suite_3d.layouts = ["utils::kChannelsPacked"]
+    test_suite_3d.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite_3d.storage_types = ["utils::kTexture3D"]
     test_suite_3d.data_gen = "make_seq_tensor"
     test_suite_3d.dtypes = ["at::kFloat"]

Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@ void add_copy_packed_dim_offset_node(`
`151`	`151`	`// Parameter buffers`
`152`	`152`	`{},`
`153`	`153`	`// Specialization Constants`
`154`		`- {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},`
	`154`	`+ {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},`
`155`	`155`	`nullptr,`
`156`	`156`	`{},`
`157`	`157`	`{`