From d286fb1192c43a754686b1bc0e35c80a2618308b Mon Sep 17 00:00:00 2001
From: Vivek Trivedi
Date: Fri, 7 Mar 2025 08:23:28 -0800
Subject: [PATCH] Modifying slice op to support all tensor packing. (#9030)

Summary:
This diff updates the ExecuTorch Vulkan backend's `slice_copy` operation to
support width-, height-, and channels-packed tensors:

- `op_registry.py`: registers `slice_copy` under a new
  `register_ported_op_all_packed_dims` entry with
  `valid_packed_dims=all_packed_dims`.
- `slice_batch_height_width.glsl` and `slice_channel.glsl`: rewrite the
  shader index math in terms of the tensor's packed dim instead of assuming
  channels packing.
- `Slice.cpp`: checks that the input and output tensors share the same
  packed dimension (rather than requiring both to be channels-packed) and
  passes the packed dim to the shaders as a specialization constant.
- `cases.py`: extends the slice test suite to cover the
  `utils::kWidthPacked`, `utils::kHeightPacked`, and
  `utils::kChannelsPacked` layouts.

Reviewed By: SS-JIA

Differential Revision: D70559149
---
 backends/vulkan/op_registry.py                | 14 ++++++-
 .../ops/glsl/slice_batch_height_width.glsl    | 26 +++++++++----
 .../runtime/graph/ops/glsl/slice_channel.glsl |  4 +-
 .../vulkan/runtime/graph/ops/impl/Slice.cpp   | 39 ++++++++++++-------
 backends/vulkan/test/op_tests/cases.py        |  6 ++-
 5 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
index 25cf74dc8f2..a26974a5f0a 100644
--- a/backends/vulkan/op_registry.py
+++ b/backends/vulkan/op_registry.py
@@ -530,7 +530,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
@@ -557,6 +556,19 @@ def register_ported_op(features: OpFeatures):
     return features
 
 
+@update_features(
+    [
+        # Indexing and lookup
+        exir_ops.edge.aten.slice_copy.Tensor,
+    ]
+)
+def register_ported_op_all_packed_dims(features: OpFeatures):
+    features.texture_impl = TextureImplFeatures(
+        valid_packed_dims=all_packed_dims,
+    )
+    return features
+
+
 # Ported ops that support their own prepacking.
 @update_features(
     [
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl
index 72594830cd4..54f0bd0b78c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl
@@ -27,8 +27,7 @@ layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
   int dim;
   int offset;
   int step;
-  // Used when dim=batch. Stride is the # of plances for each batch value.
-  int stride;
+  int image_in_channel_size;
 }
 slice_arg;
 
@@ -45,11 +44,24 @@ void main() {
 
   ivec3 in_pos = pos;
 
-  int index = pos[slice_arg.dim] / slice_arg.stride;
-  int within_stride = pos[slice_arg.dim] % slice_arg.stride;
-
-  in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride + index * slice_arg.step *
-    slice_arg.stride + within_stride;
+  // slice along batch axis
+  if (slice_arg.dim == 3) {
+    // index of the channel inside a batch
+    const int channel_index = pos.z % slice_arg.image_in_channel_size;
+    // index of batch
+    const int batch_index = pos.z / slice_arg.image_in_channel_size;
+    in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + channel_index;
+  } else if (slice_arg.dim == C_DIM) {
+    // index of the channel inside a batch
+    const int channel_index = pos.z % sizes.z;
+    // index of batch
+    const int batch_index = pos.z / sizes.z;
+    in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + channel_index * slice_arg.step;
+  } else if (slice_arg.dim == H_DIM) {
+    in_pos.y = slice_arg.offset + pos.y * slice_arg.step;
+  } else {
+    in_pos.x = slice_arg.offset + pos.x * slice_arg.step;
+  }
 
   imageStore(image_out, pos, texelFetch(image_in, in_pos, 0));
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
index 45e6c3358e8..0a6fa31a65f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
@@ -49,10 +49,10 @@ void main() {
   for (int i=0;i<4;i++) {
     ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes);
 
-    int in_channel = user_coor.z;
+    int in_dim = user_coor[packed_dim];
 
     ivec4 in_user_coor = user_coor;
-    in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step;
+    in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step;
 
     ivec4 in_pow_elem = to_texture_elem_pos(
       in_user_coor,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
index 40603394660..48584880583 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -44,8 +44,7 @@ void add_slice_tensor_copy_node(
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
 
   // Need normalize the dim
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
@@ -76,7 +75,13 @@ void add_slice_tensor_copy_node(
   start = normalize_idx(start, in_sizes[dim], 0);
   end = normalize_idx(end, in_sizes[dim], in_sizes[dim]);
 
-  if (dim_index == kChannel4D) {
+  const vkapi::SpecVarList spec_vars = {t_in->packed_dim()};
+
+  const auto packed_dim_idx =
+      static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim());
+
+  // If the slice dim is the same as the packed dim, we can use the
+  // slice_channel shader
+  if (dim_index == packed_dim_idx) {
     // slice by channel
     std::string kernel_name = "slice_channel";
     kernel_name.reserve(kShaderNameReserve);
@@ -99,26 +104,31 @@ void add_slice_tensor_copy_node(
          {in, vkapi::MemoryAccessType::READ}},
         {t_out->sizes_ubo(),
          t_in->sizes_ubo(),
-         graph.create_params_buffer(params)}));
+         graph.create_params_buffer(params)},
+        spec_vars));
   } else {
     // GPU's coordinate is in x, y, z
     int64_t gpu_dim = -1;
-    int64_t stride = 1;
+    int64_t in_channel_stride = 1;
     if (dim_index == kWidth4D) {
       gpu_dim = 0; // width: x dimension in gpu
       VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
     } else if (dim_index == kHeight4D) {
       gpu_dim = 1; // height: y dimension
       VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-    } else if (dim_index == kBatch4D) {
-      gpu_dim = 2; // batch: z dimension
-
-      // Due to channel packing, each batch value is span over stride planes
-      int64_t n_channels = dim_at(in_sizes, kChannel4D);
-      stride = utils::div_up_4(n_channels);
+    } else if (dim_index == kChannel4D) {
+      gpu_dim = 2; // channel: z dimension
+      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
+      in_channel_stride = dim_at(in_sizes, kChannel4D);
     } else {
-      VK_THROW("Unexpected ncwh_dim!");
+      gpu_dim = 3; // batch: w dimension
+
+      in_channel_stride = dim_at(in_sizes, kChannel4D);
+      if (packed_dim_idx == kChannel4D) {
+        // Due to channel packing, each batch value spans ceil(C / 4) planes
+        in_channel_stride = utils::div_up_4(in_channel_stride);
+      }
     }
 
     std::string kernel_name = "slice_batch_height_width";
@@ -137,7 +147,7 @@ void add_slice_tensor_copy_node(
         static_cast<int32_t>(gpu_dim),
         static_cast<int32_t>(start),
         static_cast<int32_t>(step),
-        static_cast<int32_t>(stride),
+        static_cast<int32_t>(in_channel_stride),
     };
 
     graph.execute_nodes().emplace_back(new DispatchNode(
@@ -147,7 +157,8 @@ void add_slice_tensor_copy_node(
         local_size,
         {{out, vkapi::MemoryAccessType::WRITE},
          {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(), graph.create_params_buffer(params)}));
+        {t_out->sizes_ubo(), graph.create_params_buffer(params)},
+        spec_vars));
   }
 }
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 38d87240b80..095df8b6677 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -585,7 +585,11 @@ def get_slice_out_inputs():
     test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
 
     test_suite.dtypes = ["at::kFloat", "at::kHalf"]
-    test_suite.layouts = ["utils::kChannelsPacked"]
+    test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
+        "utils::kChannelsPacked",
+    ]
     test_suite.data_gen = "make_seq_tensor"
     return test_suite
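
Note on the new shader indexing: the z-coordinate remapping performed by
`slice_batch_height_width.glsl` can be mirrored in plain Python. The sketch
below is illustrative only (the function and its argument names are not part
of the patch); `image_in_channel_size` is the number of z planes each batch
occupies in the input image, which `Slice.cpp` sets to `div_up_4(C)` for a
channels-packed input and to `C` otherwise.

```python
def slice_in_z(out_z, dim, offset, step, out_sizes_z, image_in_channel_size):
    """Map an output texel's z layer to the input z layer it reads from.

    `dim` follows the shader's convention (3 = batch, 2 = C_DIM);
    `out_sizes_z` plays the role of `sizes.z` in the shader (the output's
    z extent per batch); `//` and `%` stand in for GLSL integer division
    and modulo.
    """
    if dim == 3:  # slicing along batch
        channel_index = out_z % image_in_channel_size  # plane within a batch
        batch_index = out_z // image_in_channel_size  # which output batch
        return (offset + batch_index * step) * image_in_channel_size + channel_index
    if dim == 2:  # slicing along channel
        channel_index = out_z % out_sizes_z
        batch_index = out_z // out_sizes_z
        return offset + batch_index * image_in_channel_size + channel_index * step
    return out_z  # height/width slices adjust y or x instead
```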
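
Relatedly, the repeated `VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step))`
asserts the standard length of the half-open slice `[start, end)` with
positive `step`, i.e. `ceil((end - start) / step)` written in integer
arithmetic. A quick cross-check against PyTorch slicing semantics
(illustrative, not part of the patch):

```python
import torch

def slice_len(start, end, step):
    # Integer form of ceil((end - start) / step), as asserted in Slice.cpp.
    return 1 + (end - start - 1) // step

x = torch.arange(10)
for start, end, step in [(0, 10, 1), (1, 10, 3), (2, 9, 4)]:
    assert x[start:end:step].numel() == slice_len(start, end, step)
```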