diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp
new file mode 100644
index 00000000000..3b40871a791
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace vkcompute {
+
+void add_split_with_sizes_default_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    const std::vector<int64_t>& split_sizes,
+    int64_t dim,
+    ValueRef out_list_ref) {
+  vTensorPtr t_in = graph.get_tensor(in);
+
+  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+
+  ValueListPtr out_list = graph.get_value_list(out_list_ref);
+
+  NchwDim nchw_dim = normalize_to_nchw_dim(*t_in, dim);
+
+  VK_CHECK_COND(out_list->size() == split_sizes.size());
+
+  for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) {
+    int64_t split_size = split_sizes[split_idx];
+    ValueRef out_ref = (*out_list)[split_idx];
+
+    vTensorPtr t_out = graph.get_tensor(out_ref);
+    VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+    VK_CHECK_COND(dim_at(*t_out, nchw_dim) == split_size);
+  }
+
+  if (nchw_dim == DimWidth) {
+    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
+    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);
+
+    for (ValueRef out_ref : *out_list) {
+      // No need to use split_size here; we have already verified that the
+      // output tensor's size matches split_size.
+      vTensorPtr t_out = graph.get_tensor(out_ref);
+      api::utils::ivec3 range = t_out->texture_limits();
+      add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
+
+      src_offset.data[0] += range.data[0];
+    }
+  } else if (nchw_dim == DimHeight) {
+    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
+    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);
+
+    for (ValueRef out_ref : *out_list) {
+      vTensorPtr t_out = graph.get_tensor(out_ref);
+      api::utils::ivec3 range = t_out->texture_limits();
+      add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
+
+      src_offset.data[1] += range.data[1];
+    }
+  } else if (nchw_dim == DimBatch) {
+    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
+    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);
+
+    for (ValueRef out_ref : *out_list) {
+      vTensorPtr t_out = graph.get_tensor(out_ref);
+      api::utils::ivec3 range = t_out->texture_limits();
+      add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);
+
+      src_offset.data[2] += range.data[2];
+    }
+  } else if (nchw_dim == DimChannel) {
+    int32_t src_offset = 0;
+    int32_t dst_offset = 0;
+
+    for (ValueRef out_ref : *out_list) {
+      vTensorPtr t_out = graph.get_tensor(out_ref);
+      int32_t range = dim_at<Dim4D::Channel>(t_out->sizes());
+      add_copy_channel_offset_node(
+          graph, in, range, src_offset, dst_offset, out_ref);
+      src_offset += range;
+    }
+
+  } else {
+    VK_THROW("not implemented");
+  }
+}
+
+void add_split_with_sizes_default_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef split_sizes_ref,
+    ValueRef dim_ref,
+    ValueRef out) {
+  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+  std::vector<int64_t> split_sizes = *(graph.get_int_list(split_sizes_ref));
+
+  add_split_with_sizes_default_node(graph, in, split_sizes, dim, out);
+}
+
+void split_with_sizes_default(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  add_split_with_sizes_default_node(graph, args[0], args[1], args[2], args[3]);
+}
+
+void add_split_tensor_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef split_size_ref,
+    ValueRef dim_ref,
+    ValueRef out) {
+  int64_t split_size = graph.extract_scalar<int64_t>(split_size_ref);
+  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+
+  vTensorPtr t_in = graph.get_tensor(in);
+  NchwDim nchw_dim = normalize_to_nchw_dim(*t_in, dim);
+  int64_t size = dim_at(*t_in, nchw_dim);
+  std::vector<int64_t> split_sizes(size / split_size, split_size);
+
+  add_split_with_sizes_default_node(graph, in, split_sizes, dim, out);
+}
+
+void split_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  add_split_tensor_node(graph, args[0], args[1], args[2], args[3]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.split_with_sizes.default, split_with_sizes_default);
+  VK_REGISTER_OP(aten.split.Tensor, split_tensor);
+}
+
+} // namespace vkcompute
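Note on the copy-based approach in Split.cpp above: for width, height, and batch splits, the node walks the outputs and accumulates a source offset along a single texture axis, while each output's extent comes from its own texture_limits(). Below is a minimal standalone sketch of that accumulation, in plain C++ with names invented for illustration only; it is not code from this patch.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Given each output's extent along the split axis, produce the
// {source offset, extent} pair each copy would use. The offset simply
// accumulates, mirroring `src_offset.data[i] += range.data[i]` in Split.cpp.
std::vector<std::pair<int32_t, int32_t>> split_copy_ranges(
    const std::vector<int32_t>& extents) {
  std::vector<std::pair<int32_t, int32_t>> ranges;
  int32_t src_offset = 0;
  for (const int32_t extent : extents) {
    ranges.emplace_back(src_offset, extent);
    src_offset += extent; // advance along the split axis for the next output
  }
  return ranges;
}
```

For example, extents {1, 2, 3, 4} yield ranges {0,1}, {1,2}, {3,3}, {6,4}; the channel branch follows the same pattern, but accumulates channel counts and defers to add_copy_channel_offset_node instead of a plain texture-range copy.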
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h
index e7b9a614e28..c5a47b7776a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h
@@ -12,6 +12,27 @@
 
 namespace vkcompute {
 
+// A canonical way to represent dimensions as an enum. The motivation behind a
+// canonical enum is that the user tensor uses a "big-endian"-ish mechanism to
+// reference a dimension in an nchw-tensor, so tensors of different
+// dimensionality have different mappings from dim to the underlying texture
+// dimension. For instance, for a 2d (height x width) tensor, dim 0 refers to
+// height and dim 1 refers to width; for a 4d (batch x channel x height x
+// width) tensor, dim 0 refers to batch and dim 1 refers to channel. Using
+// this canonical enum brings clarity to the code.
+
+enum NchwDim : uint32_t {
+  DimWidth = 1u,
+  DimHeight = 2u,
+  DimChannel = 3u,
+  DimBatch = 4u,
+};
+
+// Convert a user-provided dim into the canonical enum.
+inline NchwDim normalize_to_nchw_dim(const vTensor& v_in, int32_t dim) {
+  return static_cast<NchwDim>(v_in.dim() - dim);
+}
+
 /*
  * Maps a semantic dimension name to an integer that
  * corresponds to its innermost ordering in a 4D tensor in
@@ -20,10 +41,10 @@ namespace vkcompute {
  * corresponds to 2, and so on.
  */
 struct Dim4D {
-  static constexpr uint32_t Width = 1u;
-  static constexpr uint32_t Height = 2u;
-  static constexpr uint32_t Channel = 3u;
-  static constexpr uint32_t Batch = 4u;
+  static constexpr uint32_t Width = DimWidth;
+  static constexpr uint32_t Height = DimHeight;
+  static constexpr uint32_t Channel = DimChannel;
+  static constexpr uint32_t Batch = DimBatch;
 };
 
 /*
@@ -65,34 +86,20 @@ uint32_t dim_at(const std::vector<int64_t>& sizes) {
   const uint32_t dims = sizes.size();
   return dims < N ? 1 : api::utils::safe_downcast<uint32_t>(sizes[dims - N]);
 }
 
+inline uint32_t dim_at(const std::vector<int64_t>& sizes, NchwDim nchw_dim) {
+  const uint32_t dims = sizes.size();
+  return dims < nchw_dim
+      ? 1
+      : api::utils::safe_downcast<uint32_t>(sizes[dims - nchw_dim]);
+}
+
 template <uint32_t N>
 uint32_t dim_at(const vTensor& v_in) {
   return dim_at<N>(v_in.sizes());
 }
 
-// A canonical way to represent dimensions as enum. Intended to use the same
-// value as Dim4D for potential future refactoring.
-
-enum NchwDim {
-  DimWidth = 1,
-  DimHeight = 2,
-  DimChannel = 3,
-  DimBatch = 4,
-};
-
-/* This function return a NchwDim
- * given a Tensor and a user provided dim. The reason for this normalization is
- * that in the user tensor coordinate, it is using a "big-endian" mechanism when
- * referring to a nchw dimension, in that dim=0 refers to the batch dimension in
- * a 4d tensor but dim=0 reference to height in a 2d tensor. Despite in a common
- * texture representation of channel packing, a 2d tensor has exactly the same
- * layout as a 4d with the batch and channel size equals to 1. This function
- * returns a canonical dimension to simplify dimension reasoning in the code.
- *
- */
-
-inline NchwDim normalize_to_nchw_dim(const vTensor& v_in, int32_t dim) {
-  return static_cast<NchwDim>(v_in.dim() - dim);
+inline uint32_t dim_at(const vTensor& v_in, NchwDim nchw_dim) {
+  return dim_at(v_in.sizes(), nchw_dim);
 }
 
 inline std::ostream& operator<<(std::ostream& os, NchwDim nchw_dim) {
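For intuition, normalize_to_nchw_dim is simply the tensor's rank minus the user-provided dim, so the same NchwDim value names the same texture axis regardless of rank. A small self-contained sketch (the enum values mirror the ones added above; everything else is local to this example):

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the NchwDim values added in DimUtils.h.
enum NchwDim : uint32_t {
  DimWidth = 1u,
  DimHeight = 2u,
  DimChannel = 3u,
  DimBatch = 4u,
};

// rank - dim, as in normalize_to_nchw_dim(v_in, dim).
inline NchwDim normalize(int64_t rank, int64_t dim) {
  return static_cast<NchwDim>(rank - dim);
}

int main() {
  assert(normalize(4, 0) == DimBatch);   // NCHW tensor: dim 0 is batch
  assert(normalize(4, 1) == DimChannel); // NCHW tensor: dim 1 is channel
  assert(normalize(2, 0) == DimHeight);  // HW tensor: dim 0 is height
  assert(normalize(2, 1) == DimWidth);   // HW tensor: dim 1 is width
  return 0;
}
```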
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp
index 842cfa2f4fc..f5e0d2b1713 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp
@@ -8,8 +8,6 @@
 
 #include
 
-#include
-
 namespace vkcompute {
 
 //
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index f0659ad8232..a326402cc39 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -438,9 +438,7 @@ def get_cat_inputs():
         ([(3, 5), (4, 5)], 0),
         ([(3, 5), (4, 5), (1, 5)], 0),
         (
-            [
-                (3, 5),
-            ],
+            [(3, 5)],
             0,
         ),
         # Cat on Width
@@ -449,9 +447,7 @@
         ([(5, 3), (5, 4)], 1),
         ([(5, 3), (5, 4), (5, 1)], 1),
         (
-            [
-                (5, 4),
-            ],
+            [(5, 4)],
             1,
         ),
         ([(5,), (6,)], 0),
@@ -474,6 +470,91 @@ def get_cat_inputs():
     return test_suite
 
 
+def get_split_with_sizes_inputs():
+    Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"])
+    test_cases = [
+        # Split on Width
+        Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3),
+        Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 10, 10), sizes=[1, 9], dim=2),
+        Test(self=(10, 10), sizes=[1, 9], dim=1),
+        Test(self=(10,), sizes=[1, 9], dim=0),
+        # Split on Height
+        Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1),
+        Test(self=(7, 10, 10), sizes=[10], dim=1),
+        Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1),
+        Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0),
+        # Split on Batch
+        Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0),
+        Test(self=(10, 7, 10, 10), sizes=[10], dim=0),
+        # Split on Channel
+        Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1),
+        Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1),
+        Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0),
+        Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0),
+        Test(self=(13, 4, 8), sizes=[13], dim=0),
+    ]
+    test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
+
+    test_suite.layouts = [
+        "api::kChannelsPacked",
+    ]
+    test_suite.data_gen = "make_seq_tensor"
+    test_suite.dtypes = ["at::kFloat"]
+    return test_suite
+
+
+def get_split_tensor_inputs():
+    test_suite = VkTestSuite(
+        [
+            # Split on Width
+            ((S1, 7, 10, 12), 12, 3),
+            ((S1, 7, 10, 12), 3, 3),
+            ((S1, 7, 10, 12), 1, 3),
+            ((7, 10, 12), 12, 2),
+            ((7, 10, 12), 3, 2),
+            ((7, 10, 12), 1, 2),
+            ((10, 12), 12, 1),
+            ((10, 12), 3, 1),
+            ((10, 12), 1, 1),
+            ((12,), 12, 0),
+            ((12,), 3, 0),
+            ((12,), 1, 0),
+            # Split on Height
+            ((S1, 7, 12, 8), 12, 2),
+            ((S1, 7, 12, 8), 3, 2),
+            ((S1, 7, 12, 8), 1, 2),
+            ((7, 12, 8), 12, 1),
+            ((7, 12, 8), 3, 1),
+            ((7, 12, 8), 1, 1),
+            ((12, 8), 12, 0),
+            ((12, 8), 3, 0),
+            ((12, 8), 1, 0),
+            # Split on Batch
+            ((12, 7, 10, 10), 12, 0),
+            ((12, 7, 10, 10), 3, 0),
+            ((12, 7, 10, 10), 1, 0),
+            # Split on Channel
+            ((7, 15, 10, 10), 15, 1),
+            ((7, 15, 10, 10), 5, 1),
+            ((7, 15, 10, 10), 3, 1),
+            ((7, 15, 10, 10), 1, 1),
+            ((15, 10, 10), 15, 0),
+            ((15, 10, 10), 5, 0),
+            ((15, 10, 10), 3, 0),
+            ((15, 10, 10), 1, 0),
+        ]
+    )
+
+    test_suite.layouts = [
+        "api::kChannelsPacked",
+    ]
+    test_suite.data_gen = "make_seq_tensor"
+    test_suite.dtypes = ["at::kFloat"]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -494,4 +575,6 @@ def get_cat_inputs():
     "aten.clone.default": get_clone_inputs(),
     "aten.repeat.default": get_repeat_inputs(),
     "aten.cat.default": get_cat_inputs(),
+    "aten.split_with_sizes.default": get_split_with_sizes_inputs(),
+    "aten.split.Tensor": get_split_tensor_inputs(),
 }
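The aten.split.Tensor cases above all use a split_size that evenly divides the target dimension, which matches how add_split_tensor_node expands a single split_size into a split_sizes vector before reusing the split_with_sizes path. A sketch of that expansion under the even-divisibility assumption, with an illustrative helper name that is not part of the patch:

```cpp
#include <cstdint>
#include <vector>

// E.g. a dimension of size 15 with split_size 5 expands to {5, 5, 5},
// which is then handled exactly like aten.split_with_sizes.
std::vector<int64_t> expand_split_size(int64_t dim_size, int64_t split_size) {
  return std::vector<int64_t>(dim_size / split_size, split_size);
}
```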
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
index ac5e25fa596..a43998b47c9 100644
--- a/backends/vulkan/test/op_tests/utils/codegen.py
+++ b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -24,6 +24,7 @@
     OPT_LAYOUT,
     OPT_MEMORY_FORMAT,
     OPT_SCALAR_TYPE,
+    TENSOR_VECTOR,
     TestSuite,
     TestSuiteGen,
     THREE_TENSOR_TUPLE,
@@ -73,6 +74,26 @@ class ValueRef:
     is_out: bool = False
     requires_prepack: bool = False
     supports_prepack: bool = False
+    # When is_dynamic_size is true, the size of the underlying object is not
+    # known during code-gen. An example is the out value for aten.split, where
+    # the out value is a vector. In these cases, we need an additional
+    # vector or at::TensorList to track these values.
+    is_dynamic_size: bool = False
+
+    @property
+    def io_value_list_name(self):
+        assert self.is_dynamic_size
+        return f"{self.name}_io_value_list"
+
+    @property
+    def value_list_name(self):
+        assert self.is_dynamic_size
+        return f"{self.name}_value_list"
+
+    @property
+    def vk_out(self):
+        assert self.is_out
+        return f"vk_{self.name}"
 
 
 ValueRefList = Union[ValueRef, List[ValueRef]]
@@ -177,6 +198,18 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite):
                     is_out=False,
                 ),
             ]
+        elif ret_type == TENSOR_VECTOR:
+            self.refs["out"] = ValueRef(
+                name="out_ref",
+                src_cpp_name="out",
+                src_cpp_type=ret_type,
+                is_out=True,
+                is_dynamic_size=True,
+            )
+        else:
+            raise NotImplementedError(
+                f"ret_type: {ret_type} not supported for out value"
+            )
 
 
 ## ATen code generation
@@ -267,6 +300,21 @@ def create_value_for(self, ref: ValueRefList) -> str:  # noqa: C901
             ret_str += "}\n"
             ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n"
             return ret_str
+        elif ref.src_cpp_type == TENSOR_VECTOR:
+            ret_str = f"""
+std::vector {ref.io_value_list_name};
+std::vector {ref.value_list_name};
+for (int i=0; i
 str:
             for r in ref[:-1]:
                 ret_str += self.set_output(r)
             return ret_str
+        elif ref.src_cpp_type == TENSOR_VECTOR:
+            assert ref.is_out
+            ret_str = f"""
+for (int i=0; i
 str:
             for r in ref[:-1]:
                 ret_str += self.declare_vk_out_for(r)
             return ret_str
+        elif ref.src_cpp_type == TENSOR_VECTOR:
+            assert ref.is_out
+            ret_str = f"""
+std::vector {ref.vk_out};
+for (int i=0; i
 str:
             for r in ref[:-1]:
                 ret_str += self.copy_from_staging(r)
             return ret_str
+        elif ref.src_cpp_type == TENSOR_VECTOR:
+            assert ref.is_out
+            ret_str = f"""
+for (int i=0; i
 str:
             for r in ref[:-1]:
                 ret_str += self.check_graph_out(r)
             return ret_str
+        elif ref.src_cpp_type == TENSOR_VECTOR:
+            assert ref.is_out
+            ret_str = f"""
+for (int i=0; i
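Background for the TENSOR_VECTOR handling in codegen.py: the reference output of aten.split / aten.split_with_sizes is a std::vector of tensors whose length is only known from the test case, which is why the out value is tracked with an additional list (is_dynamic_size) rather than a fixed set of named tensors. A minimal ATen-only illustration of that return shape, independent of the generated harness code and with example shapes chosen arbitrarily:

```cpp
#include <ATen/ATen.h>

#include <vector>

int main() {
  const at::Tensor self = at::rand({2, 7, 4, 4});
  // Three outputs with channel sizes 3, 3, and 1; the number of outputs
  // depends on the test case, not on the operator schema.
  const std::vector<at::Tensor> outs =
      self.split_with_sizes({3, 3, 1}, /*dim=*/1);
  for (const at::Tensor& t : outs) {
    (void)t.sizes(); // one per-output check would be emitted for each element
  }
  return 0;
}
```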