[ET-VK] Moving repeat functionality from copy_packed_dim_offset into a separate repeat shader. #9428

Merged
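
In brief, as reflected in the changed files below: the repeat path is split out of copy_packed_dim_offset.glsl into a dedicated repeat.glsl shader with its own repeat.yaml variant list. copy_packed_dim_offset now only handles offset copies, so its repeat specialization constant and the dual meaning of its src_offset / dst_offset push constants go away, and add_copy_packed_dim_offset_node in Copy.cpp drops its repeat parameter and always enforces the Width/Height packed-dim check. The new shader takes range, src_dims (source tensor sizes in WHCB order), and dst_repeats (repeat counts in WHCB order) as push constants and maps each output texel back to its source texel with modular arithmetic.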
114 changes: 6 additions & 108 deletions backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl
@@ -21,16 +21,10 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
layout(push_constant) uniform restrict Block {
ivec4 range;

// if not repeating
// xyz is source offset w is channel size
// if repeating
// xyzw is source tensor sizes in WHCB dims respectively
ivec4 src_offset;

// if not repeating
// xyz is destination offset w is channel size
// if repeating
// xyzw is destination tensor sizes in WHCB dims respectively
ivec4 dst_offset;
};

@@ -45,9 +39,13 @@ const lowp int packed_dim = unhash_packed_dim(out_layout)
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

${layout_declare_spec_const(C, "int", "repeat", "0")}
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range.xyz))) {
return;
}

void no_repeat_copy(ivec3 pos) {
// Position in input tensor
ivec3 in_pos = pos + src_offset.xyz;
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -135,103 +133,3 @@ void no_repeat_copy(ivec3 pos) {
out_value,
out_axis_map);
}

void repeat_copy(ivec3 pos) {
// expand position in packed dim
pos[packed_dim] <<= 2;

// channel size aligned by 4 when tensors are channel packed raw value otherwise
const int channel_size = (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);

// find input texel's WHCB index
const int width_index = pos.x % src_offset.x;
const int height_index = pos.y % src_offset.y;
int channel_index;
int batch_index;

// if tensors are channel packed
if (packed_dim == C_DIM) {
// the output channels in a batch will be channel size * channel repetitions aligned by 4
const int out_channel_size = alignup4(src_offset.z * dst_offset.z);

// batch index in the output
const int out_pos_batch_index = pos.z / out_channel_size;

// source batch index for based on current output pos
batch_index = out_pos_batch_index % src_offset.w;

// batch repetition count for current output pos
const int batch_repetition_index = out_pos_batch_index / src_offset.w;

// calculate input channel index based on current output pos and batch index
// its done this way because we want source channel to restart from zero when a batch index increments
// also batch_index will reset to zero after hitting batch repetition count
// so track the current repetition in batch_repetition_index so it can be used for determining current_index
channel_index = (pos.z - (batch_index + batch_repetition_index * src_offset.w) * out_channel_size) % src_offset.z;
} else {
// the output channels in a batch will be channel size * channel repetitions
const int out_channel_size = src_offset.z * dst_offset.z;

// source batch index for based on current output pos
batch_index = (pos.z / out_channel_size) % src_offset.w;

// source channel index is current output pos wrapped based on channel count
channel_index = pos.z % src_offset.z;
}

// input texel's WCB position
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);

// squeeze position in packed dim
pos[packed_dim] >>= 2;

// packed dim index of texel last fetched
int fetched_in_pos_packed_dim = -1;

// fetched input texel
VEC4_T in_value;

// output texel value
VEC4_T out_value = VEC4_T(0);

int src_lane_offset = in_pos[packed_dim];

for (int i=0; i<4; i++) {
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
fetched_in_pos_packed_dim = (src_lane_offset >> 2);

ivec3 curr_in_pos = in_pos;
curr_in_pos[packed_dim] = src_lane_offset;
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
curr_in_pos[packed_dim] >>= 2;

in_value = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
}

out_value[i] = in_value[src_lane_offset & 0x3];

src_lane_offset++;
// if packed index exceeded source packed dim round to zero
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
}

write_texel_lpos(
t_out,
pos,
out_value,
out_axis_map);
}

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range.xyz))) {
return;
}

if (repeat == 1) {
repeat_copy(pos);
} else {
no_repeat_copy(pos);
}
}
129 changes: 129 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/repeat.glsl
@@ -0,0 +1,129 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}

layout(push_constant) uniform restrict Block {
ivec4 range;
// source tensor sizes in WHCB dims respectively
ivec4 src_dims;
// destination tensor repeats in WHCB dims respectively
ivec4 dst_repeats;
};

#include "indexing_utils.h"

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, range.xyz))) {
return;
}

// expand position in packed dim
pos[packed_dim] <<= 2;

// channel size aligned up to a multiple of 4 when tensors are channel packed, raw value otherwise
const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z);

// find input texel's WHCB index
const int width_index = pos.x % src_dims.x;
const int height_index = pos.y % src_dims.y;
int channel_index;
int batch_index;

// if tensors are channel packed
if (packed_dim == C_DIM) {
// the output channels in a batch will be channel size * channel repetitions, aligned up to a multiple of 4
const int out_channel_size = alignup4(src_dims.z * dst_repeats.z);

// batch index in the output
const int out_pos_batch_index = pos.z / out_channel_size;

// source batch index based on the current output pos
batch_index = out_pos_batch_index % src_dims.w;

// batch repetition index for the current output pos
const int batch_repetition_index = out_pos_batch_index / src_dims.w;

// calculate the input channel index based on the current output pos and batch index
// it is done this way because the source channel should restart from zero when the batch index increments
// batch_index itself wraps back to zero after hitting the batch repetition count,
// so the current repetition is tracked in batch_repetition_index and used when determining channel_index
channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z;
} else {
// the output channels in a batch will be channel size * channel repetitions
const int out_channel_size = src_dims.z * dst_repeats.z;

// source batch index based on the current output pos
batch_index = (pos.z / out_channel_size) % src_dims.w;

// source channel index is the current output pos wrapped by the source channel count
channel_index = pos.z % src_dims.z;
}

// input texel's WCB position
const ivec3 in_pos = ivec3(width_index, height_index, channel_index);

// squeeze position in packed dim
pos[packed_dim] >>= 2;

// packed dim index of texel last fetched
int fetched_in_pos_packed_dim = -1;

// fetched input texel
VEC4_T in_value;

// output texel value
VEC4_T out_value = VEC4_T(0);

int src_lane_offset = in_pos[packed_dim];

for (int i=0; i<4; i++) {
if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) {
fetched_in_pos_packed_dim = (src_lane_offset >> 2);

ivec3 curr_in_pos = in_pos;
curr_in_pos[packed_dim] = src_lane_offset;
curr_in_pos.z = curr_in_pos.z + batch_index * channel_size;
curr_in_pos[packed_dim] >>= 2;

in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map));
}

out_value[i] = in_value[src_lane_offset & 0x3];

src_lane_offset++;
// if the packed index went past the source packed dim, wrap back to zero
src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]);
}

write_texel_lpos(
t_out,
pos,
out_value,
out_axis_map);
}
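
To make the index arithmetic above easier to follow, here is a minimal host-side C++ sketch of the mapping the shader performs. It is an illustration only, not part of the PR: it ignores texel packing and the alignup4 adjustment used when tensors are channel packed, and the Dims struct, the source_pos helper, and the example numbers are hypothetical.

```cpp
// Host-side scalar model of the index mapping in repeat.glsl.
// Illustration only: ignores texel packing and the alignup4 adjustment the
// shader applies when tensors are channel packed. Names are hypothetical.
#include <cstdio>

struct Dims { int W, H, C, B; };  // sizes in WHCB order, like src_dims

// The texture z axis folds channel and batch together:
//   pos_z = out_batch * (C * repeat_C) + out_channel
// so the source batch/channel are recovered with div/mod, mirroring the
// shader's non-channel-packed branch.
void source_pos(int pos_x, int pos_y, int pos_z, Dims src, int repeat_C,
                int& in_x, int& in_y, int& in_z) {
  const int out_channel_size = src.C * repeat_C;  // channels per output batch
  in_x = pos_x % src.W;                           // wrap along width
  in_y = pos_y % src.H;                           // wrap along height
  const int batch_index = (pos_z / out_channel_size) % src.B;
  const int channel_index = pos_z % src.C;        // wrap along channel
  in_z = channel_index + batch_index * src.C;     // refold into the z axis
}

int main() {
  // Source tensor with W=2, H=3, C=4, B=2, repeated 3x along channels.
  Dims src{2, 3, 4, 2};
  int x, y, z;
  source_pos(/*pos_x=*/3, /*pos_y=*/4, /*pos_z=*/17, src, /*repeat_C=*/3, x, y, z);
  // pos_z=17: output batch 17/12 = 1 -> source batch 1 % 2 = 1,
  //           source channel 17 % 4 = 1, so in_z = 1 + 1*4 = 5.
  std::printf("in = (%d, %d, %d)\n", x, y, z);  // prints: in = (1, 1, 5)
  return 0;
}
```

The channel-packed branch of the shader follows the same idea, except the per-batch channel count is first aligned up to a multiple of 4, so the within-batch channel has to be recovered before the final modulo.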
14 changes: 14 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/repeat.yaml
@@ -0,0 +1,14 @@
repeat:
  parameter_names_with_default_values:
    DTYPE: float
    NDIM: 3
    STORAGE: texture3d
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
      - VALUE: int
      - VALUE: int8
      - VALUE: uint8
  shader_variants:
    - NAME: repeat
94 changes: 44 additions & 50 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -71,21 +71,17 @@ void add_copy_packed_dim_offset_node(
const ivec3& range,
const ivec4& src_offset,
const ivec4& dst_offset,
const ValueRef out,
bool repeat) {
const ValueRef out) {
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

// Check the packed dimension is same for both tensors
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
if (!repeat) {
// For non repeat copy also check if the packed dimension is Width or
// Height. Since the function does not support channel packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
}
// Check that the packed dimension is the same for both tensors, and that the
// packed dimension is Width or Height, since this function does not support
// channel packing.
VK_CHECK_COND(
check_same_packed_dim(*t_in, *t_out) &&
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
check_packed_dim_is(*t_in, WHCN::kHeightDim)));

std::string kernel_name = "copy_packed_dim_offset";
kernel_name.reserve(kShaderNameReserve);
@@ -96,43 +92,41 @@ void add_copy_packed_dim_offset_node(
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
ivec3 global_wg_size = t_out->logical_limits();

if (!repeat) {
const auto packed_dim = t_in->packed_dim();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from
// The first texel of tensor data in packed dimension will be copied from
// remaining lanes from current source Hence (4 - src_lane_offset) is added
// to tensor size in packed dimension
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to
// The first texel of tensor data in packed dimension will be copied to
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
// to tensor size in packed dimension
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0, and the total packed texels is
// greater than the source texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension
}
const auto packed_dim = t_in->packed_dim();
// The starting offset in a texel where this tensor will start copying from
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
// The starting offset in a texel where this tensor will start copying to
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;

// The total packed texels this tensor will be copied from.
// The first texel of tensor data in the packed dimension will be copied from
// the remaining lanes of the current source, hence (4 - src_lane_offset) is
// added to the tensor size in the packed dimension.
const auto src_packed_size = utils::div_up_4(
(4 - src_lane_offset) +
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));

// The total packed texels this tensor will be copied to.
// The first texel of tensor data in the packed dimension will be copied to
// the remaining lanes from the previous write, hence (4 - dst_lane_offset) is
// added to the tensor size in the packed dimension.
const auto dst_packed_size = utils::div_up_4(
(4 - dst_lane_offset) +
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));

// If the starting src offset is not 0 and the total packed texels exceed
// the source texel range
const bool has_additional_src_work =
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
// If the starting dst offset is not 0 and the total packed texels exceed
// the texel range
const bool has_additional_dst_work =
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];

if (has_additional_src_work || has_additional_dst_work) {
global_wg_size[packed_dim]++; // Increase the global work group size in
// packed dimension
final_range[packed_dim]++; // Increase the range in packed dimension
}

auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -151,7 +145,7 @@ void add_copy_packed_dim_offset_node(
// Parameter buffers
{},
// Specialization Constants
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat ? 1 : 0},
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
nullptr,
{},
{
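
One detail in Copy.cpp worth spelling out is the extra-texel check: when a copy starts part-way into a 4-lane texel, the data straddles one more texel than the element count alone suggests, so the dispatch range grows by one along the packed dimension. Below is a standalone sketch of that check; it is not the actual ExecuTorch code: div_up_4 is reimplemented locally, the base range is simplified to the element count rounded up to whole texels, and the numbers are made up.

```cpp
// Standalone illustration of the "extra texel" check in
// add_copy_packed_dim_offset_node. Not the actual ExecuTorch code: div_up_4
// is a local stand-in and the example values are hypothetical.
#include <cstdio>

int div_up_4(int n) { return (n + 3) / 4; }  // round up to whole 4-lane texels

int main() {
  // 11 elements to copy along the packed dim; the source copy starts at
  // element offset 6, i.e. lane 2 inside its first texel.
  const int elems_in_packed_dim = 11;
  const int src_offset_packed = 6;
  const int src_lane_offset = src_offset_packed & 0x3;  // = 2 (mid-texel start)

  // Texels read on the source side: the first texel only contributes its
  // remaining (4 - lane_offset) lanes, so that slack is added before rounding.
  const int src_packed_size =
      div_up_4((4 - src_lane_offset) + elems_in_packed_dim);  // = 4

  // Texel range the dispatch would cover if the copy were texel-aligned.
  const int base_range = div_up_4(elems_in_packed_dim);  // = 3

  // Starting mid-texel makes the data straddle one more texel than base_range
  // covers, so the work group range grows by one in the packed dimension.
  const bool extra_work =
      src_lane_offset != 0 && src_packed_size > base_range;
  std::printf("src_packed_size=%d base_range=%d extra_work=%d\n",
              src_packed_size, base_range, extra_work);  // 4 3 1
  return 0;
}
```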