diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
index e0f09f0be43..c80cc9aa6dc 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
@@ -20,9 +20,17 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 
 layout(push_constant) uniform restrict Block {
   ivec4 range;
+
+  // if not repeating:
   // xyz is source offset w is channel size
+  // if repeating:
+  // xyzw are the source tensor sizes in WHCB dims respectively
   ivec4 src_offset;
+
+  // if not repeating:
   // xyz is destination offset w is channel size
+  // if repeating:
+  // xyzw are the destination tensor sizes in WHCB dims respectively
   ivec4 dst_offset;
 };
 
@@ -37,13 +45,9 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
 
 ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, range.xyz))) {
-    return;
-  }
+${layout_declare_spec_const(C, "bool", "repeat", "false")}
 
+void no_repeat_copy(ivec3 pos) {
   // Position in input tensor
   ivec3 in_pos = pos + src_offset.xyz;
   in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -138,3 +142,103 @@ void main() {
       out_value,
       out_axis_map);
 }
+
+void repeat_copy(ivec3 pos) {
+  // expand position in packed dim to an element index
+  pos[packed_dim] <<= 2;
+
+  // channel size, aligned up to a multiple of 4 when tensors are channel packed, raw value otherwise
+  const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);
+
+  // find the input texel's WHCB index
+  const int width_index = pos.x % src_offset.x;
+  const int height_index = pos.y % src_offset.y;
+  int channel_index;
+  int batch_index;
+
+  // if tensors are channel packed
+  if (packed_dim == C_DIM) {
+    // the output channels in a batch will be channel size * channel repetitions, aligned up to 4
+    const int out_channel_size = alignup4(src_offset.z * dst_offset.z);
+
+    // batch index in the output
+    const int out_pos_batch_index = pos.z / out_channel_size;
+
+    // source batch index based on the current output pos
+    batch_index = out_pos_batch_index % src_offset.w;
+
+    // batch repetition index for the current output pos
+    const int batch_repetition_index = out_pos_batch_index / src_offset.w;
+
+    // calculate the input channel index based on the current output pos and batch index.
+    // it is done this way because the source channel should restart from zero whenever the batch index increments.
+    // batch_index itself wraps back to zero after every full pass over the source batches, so the current
+    // repetition is tracked in batch_repetition_index and used when determining channel_index.
+    channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
+  } else {
+    // the output channels in a batch will be channel size * channel repetitions
+    const int out_channel_size = src_offset.z * dst_offset.z;
+
+    // source batch index based on the current output pos
+    batch_index = (pos.z / out_channel_size) % src_offset.w;
+
+    // source channel index is the current output pos wrapped on the source channel count
+    channel_index = pos.z % src_offset.z;
+  }
+
+  // input texel's WHC position
+  const ivec3 in_pos = ivec3(width_index, height_index, channel_index);
+
+  // squeeze position in packed dim back to a texel index
+  pos[packed_dim] >>= 2;
+
+  // packed dim index of the last fetched texel
+  int fetched_in_pos_packed_dim = -1;
+
+  // fetched input texel
+  VEC4_T in_value;
+
+  // output texel value
+  VEC4_T out_value = VEC4_T(0);
+
+  int src_lane_offset = in_pos[packed_dim];
+
+  for (int i = 0; i < 4; i++) {
+    if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
+      fetched_in_pos_packed_dim = (src_lane_offset >> 2);
+
+      ivec3 curr_in_pos = in_pos;
+      curr_in_pos[packed_dim] = src_lane_offset;
+      curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
+      curr_in_pos[packed_dim] >>= 2;
+
+      in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
+    }
+
+    out_value[i] = in_value[src_lane_offset & 0x3];
+
+    src_lane_offset++;
+    // if the packed index has moved past the source packed dim size, wrap it back to zero
+    src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
+  }
+
+  write_texel_lpos(
+      t_out,
+      pos,
+      out_value,
+      out_axis_map);
+}
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, range.xyz))) {
+    return;
+  }
+
+  if (repeat) {
+    repeat_copy(pos);
+  } else {
+    no_repeat_copy(pos);
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
index 5756d3a9052..d006dee74a7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -71,61 +71,69 @@ void add_copy_packed_dim_offset_node(
     const ivec3& range,
     const ivec4& src_offset,
     const ivec4& dst_offset,
-    const ValueRef out) {
+    const ValueRef out,
+    bool repeat) {
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  // Check the packed dimension is same for both tensors, and if the packed
-  // dimension is Width or Height. Since the function does not support channel
-  // packing.
-  VK_CHECK_COND(
-      check_same_packed_dim(*t_in, *t_out) &&
-      (check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
-       check_packed_dim_is(*t_in, WHCN::kHeightDim)));
+  // Check that the packed dimension is the same for both tensors
+  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
+  if (!repeat) {
+    // For a non-repeat copy, also check that the packed dimension is Width or
+    // Height, since this path does not support channel packing.
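+    // The repeat path re-derives the source WHCB indices per output texel in
+    // the shader (see repeat_copy), so it handles channel packed tensors too.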
+    VK_CHECK_COND(
+        check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
+        check_packed_dim_is(*t_in, WHCN::kHeightDim));
+  }
 
   std::string kernel_name = "copy_packed_dim_offset";
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  const auto packed_dim = t_in->packed_dim();
   // A copy of range with the last element set to batch size of the input tensor
   ivec4 final_range = {
       range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
   ivec3 global_wg_size = t_out->logical_limits();
-  // The starting offset in a texel where this tensor will start copying from
-  const auto src_lane_offset = src_offset[packed_dim] & 0x3;
-  // The starting offset in a texel where this tensor will start copying to
-  const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
-
-  // The total packed texels this tensor will be copied from
-  // The first texel of tensor data in packed dimension will be copied from
-  // remaining lanes from current source Hence (4 - src_lane_offset) is added
-  // to tensor size in packed dimension
-  const auto src_packed_size = utils::div_up_4(
-      (4 - src_lane_offset) +
-      dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
-
-  // The total packed texels this tensor will be copied to
-  // The first texel of tensor data in packed dimension will be copied to
-  // remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
-  // tensor size in packed dimension
-  const auto dst_packed_size = utils::div_up_4(
-      (4 - dst_lane_offset) +
-      dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
-
-  // If the starting src offset is not 0, and the total packed texels is greater
-  // than the source texel range
-  const bool has_additional_src_work =
-      src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
-  // If the starting dst offset is not 0, and the total packed texels is greater
-  // than the source texel range
-  const bool has_additional_dst_work =
-      dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
-
-  if (has_additional_src_work || has_additional_dst_work) {
-    global_wg_size[packed_dim]++; // Increase the global work group size in
-                                  // packed dimension
-    final_range[packed_dim]++; // Increase the range in packed dimension
+
+  if (!repeat) {
+    const auto packed_dim = t_in->packed_dim();
+    // The starting offset in a texel where this tensor will start copying from
+    const auto src_lane_offset = src_offset[packed_dim] & 0x3;
+    // The starting offset in a texel where this tensor will start copying to
+    const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
+
+    // The total number of packed texels this tensor will be copied from.
+    // The first texel of tensor data in the packed dimension will be copied
+    // from the remaining lanes of the current source texel, hence
+    // (4 - src_lane_offset) is added to the tensor size in the packed dim.
+    const auto src_packed_size = utils::div_up_4(
+        (4 - src_lane_offset) +
+        dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
+
+    // The total number of packed texels this tensor will be copied to.
+    // The first texel of tensor data in the packed dimension will be copied
+    // into the remaining lanes of the previous write, hence
+    // (4 - dst_lane_offset) is added to the tensor size in the packed dim.
+    const auto dst_packed_size = utils::div_up_4(
+        (4 - dst_lane_offset) +
+        dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
+
+    // If the starting src offset is not 0 and the total packed texel count
+    // is greater than the source texel range
+    const bool has_additional_src_work =
+        src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
+    // If the starting dst offset is not 0 and the total packed texel count
+    // is greater than the source texel range
+    const bool has_additional_dst_work =
+        dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
+
+    if (has_additional_src_work || has_additional_dst_work) {
+      global_wg_size[packed_dim]++; // Increase the global work group size in
+                                    // packed dimension
+      final_range[packed_dim]++; // Increase the range in packed dimension
+    }
   }
 
   auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -144,7 +152,7 @@ void add_copy_packed_dim_offset_node(
       // Parameter buffers
       {},
       // Specialization Constants
-      {graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
+      {graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
       nullptr,
       {},
       {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h
index e9388345afa..9761d571caf 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Copy.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Copy.h
@@ -53,13 +53,16 @@ void add_copy_offset_node(
 // dst_offset (all are in texture coordinate (x, y, z) from the input image to
 // the output image.
 //
+// The repeat flag indicates whether the copy should wrap around the source
+// tensor's dims; it is set to true only by the repeat op.
 void add_copy_packed_dim_offset_node(
     ComputeGraph& graph,
     const ValueRef in,
     const utils::ivec3& range,
     const utils::ivec4& src_offset,
     const utils::ivec4& dst_offset,
-    const ValueRef out);
+    const ValueRef out,
+    bool repeat = false);
 
 // add_copy_channel_offset_node behaves similar to add_copy_node, except that it
 // works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).
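
Reviewer note: the wrap-around indexing that `repeat_copy` performs can be modeled on the CPU as below. This is a minimal illustrative sketch only, assuming plain WHCB coordinates; it ignores the 4-wide texel packing and the align-up-by-4 handling of channel packed tensors, and `source_coord`, `out_whcb`, and `src_sizes` are hypothetical names that are not part of this change.

```cpp
#include <array>
#include <cassert>

// Map an output WHCB coordinate back to the source coordinate it repeats
// from: each dim simply wraps around on the corresponding source size,
// which is the per-dim modulo that repeat_copy computes (width_index,
// height_index, channel_index, batch_index) before fetching input lanes.
std::array<int, 4> source_coord(
    const std::array<int, 4>& out_whcb, // output coordinate, WHCB order
    const std::array<int, 4>& src_sizes) { // source tensor sizes, WHCB order
  std::array<int, 4> src{};
  for (int d = 0; d < 4; d++) {
    assert(src_sizes[d] > 0);
    src[d] = out_whcb[d] % src_sizes[d];
  }
  return src;
}

int main() {
  // A source 2 texels wide repeated along width: output x = 5 reads source x = 1.
  const auto src = source_coord({5, 0, 0, 0}, {2, 1, 1, 1});
  return src[0] == 1 ? 0 : 1;
}
```

The channel packed path is the one place this simple model diverges: there the combined channel-batch extent is rounded up to a multiple of 4, which is what the `alignup4` calls in `repeat_copy` account for.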