diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index ab15658111f..50b60ad956d 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -70,9 +70,9 @@ void main() {
   int kx = 0;
   for (int y = start.y; y < end.y; y += params.dilation.y) {
     for (int x = start.x; x < end.x; x += params.dilation.x) {
-      // The weight kernel was rearranged so that every NxN filter is flattened
-      // to fits in one row. Each filter was then stacked on top of each other
-      // vertically.
+      // The weight kernel was rearranged such that every NxN filter is
+      // flattened to fit in one row. Each filter was then stacked on top of
+      // each other vertically.
       const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
       sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
       ++kx;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
new file mode 100644
index 00000000000..470eef6cdeb
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
+layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
+
+layout(set = 0, binding = 4) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+  uvec4 data;
+}
+in_extents;
+
+layout(set = 0, binding = 6) uniform PRECISION restrict Params {
+  ivec2 kernel_size;
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilation;
+}
+params;
+
+// If fields are separated, SwiftShader cannot identify in_group_size.
+layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
+  ivec2 overlay_region;
+  int in_group_size;
+}
+extra_params;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes a depthwise convolution for a square TILE_SIZE x TILE_SIZE kernel.
+ * Each shader invocation calculates the output at a single output location.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
+    return;
+  }
+
+  // Compute the index of the top-left element of the overlay region. Negative
+  // indices indicate that the top-left element is in a region added by padding.
+  const ivec2 ipos = pos.xy * params.stride - params.padding;
+
+  // First input index to load. The loops below always visit TILE_SIZE taps per
+  // axis and do not clamp to the input extents, so taps in the padding region
+  // are still fetched. NOTE(review): presumably such fetches return 0; confirm.
+  const ivec2 start = ipos;
+
+  ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  int kx = 0;
+  for (int y = start.y, i = 0; i < ${TILE_SIZE}; y += params.dilation.y, i++) {
+    for (int x = start.x, j = 0; j < ${TILE_SIZE}; x += params.dilation.x, j++) {
+      // The weight kernel was rearranged such that every NxN filter is
+      // flattened to fit in one row. The flattened filters were then stacked
+      // vertically; row pos.z is read left to right as kx advances.
+      const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
+      kx++;
+    }
+  }
+
+  imageStore(image_out, pos, sum);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
new file mode 100644
index 00000000000..1d4405e0276
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_dw_output_tile:
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+    TILE_SIZE: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: conv2d_dw_output_tile_3x3
+    - NAME: conv2d_dw_output_tile_5x5
+      TILE_SIZE: 5
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
index d0b7c89ea5a..c9cf81101be 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
@@ -34,12 +34,12 @@ void resize_conv2d_node(
   if (ndim == 4) {
     new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4);
   }
-  const auto weight_sizes = graph->get_val(extra_args[0]).toTensorRef().sizes;
+  const auto& weight_sizes = graph->get_val(extra_args[0]).toTensorRef().sizes;
   new_out_sizes.at(ndim - 3) =
       transposed ? weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4);
 
   // Height, Width
-  const auto new_out_sizes_hw = calc_out_sizes_hw(
+  const auto& new_out_sizes_hw = calc_out_sizes_hw(
       *graph,
       self.sizes(),
       extra_args[0],
@@ -87,13 +87,24 @@ enum class Conv2dMethod : uint8_t {
 };
 
 api::ShaderInfo get_conv2d_shader(
+    ComputeGraph& graph,
     const vTensor& t_out,
     const bool prepack_weights,
-    const Conv2dMethod method) {
+    const Conv2dMethod method,
+    const ValueRef weight) {
   std::stringstream kernel_name;
   switch (method) {
     case Conv2dMethod::Depthwise:
       kernel_name << "conv2d_dw";
+      if (!prepack_weights) {
+        const auto& weight_sizes = graph.get_val(weight).toTensorRef().sizes;
+        if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
+          kernel_name << "_output_tile_3x3";
+        }
+        if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
+          kernel_name << "_output_tile_5x5";
+        }
+      }
       break;
     case Conv2dMethod::SlidingWindow:
       kernel_name << "conv2d";
@@ -156,7 +167,7 @@ ValueRef prepack_weights(
     const ValueRef vref,
     const Conv2dMethod method) {
   const auto original_sizes = graph.get_val(vref).toTensorRef().sizes;
-  const auto final_sizes = get_final_sizes(original_sizes, method);
+  const auto& final_sizes = get_final_sizes(original_sizes, method);
 
   ValueRef v = graph.add_tensor(
       final_sizes,
@@ -169,9 +180,9 @@ ValueRef prepack_weights(
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
   api::ShaderInfo shader =
-      get_conv2d_shader(t, /*prepack_weights = */ true, method);
+      get_conv2d_shader(graph, t, /*prepack_weights = */ true, method, vref);
 
-  const auto padded_sizes = get_padded_sizes(original_sizes, method);
+  const auto& padded_sizes = get_padded_sizes(original_sizes, method);
 
   graph.prepack_nodes().emplace_back(new PrepackNode(
       graph,
@@ -210,13 +221,13 @@ Conv2dParams create_conv2d_params(
     const ValueRef weight,
     const KernelParams& p,
     const bool transposed) {
-  const auto overlay_region = api::utils::make_ivec2({
+  const auto& overlay_region = api::utils::make_ivec2({
       p.kernel_size.data[0] +
           (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1),
       p.kernel_size.data[1] +
           (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1),
   });
-  const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
+  const auto& weight_sizes = graph.get_val(weight).toTensorRef().sizes;
   const int32_t in_group_size =
       api::utils::safe_downcast<int32_t>(api::utils::align_up(
           transposed ? weight_sizes.at(0) : weight_sizes.at(1), INT64_C(4)));
@@ -244,7 +255,7 @@ Conv2dMethod get_conv2d_method(
     const ValueRef weight,
     const int64_t groups,
     const bool transposed) {
-  const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
+  const auto& weight_sizes = graph.get_val(weight).toTensorRef().sizes;
   if (!transposed && weight_sizes.at(0) == groups && weight_sizes.at(1) == 1) {
     return Conv2dMethod::Depthwise;
   }
@@ -298,8 +309,8 @@ void add_conv2d_node(
 
   check_conv2d_params(kernel_params, transposed_val);
 
-  api::ShaderInfo shader =
-      get_conv2d_shader(t_out, /*prepack_weights = */ false, method);
+  api::ShaderInfo shader = get_conv2d_shader(
+      graph, t_out, /*prepack_weights = */ false, method, weight);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index a6ac2a1cb87..795e13940d6 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -35,7 +35,7 @@ void resize_max_pool2d_node(
   new_out_sizes.at(ndim - 3) = self.sizes().at(ndim - 3);
 
   // Height, Width
-  const auto new_out_sizes_hw = calc_out_sizes_hw(
+  const auto& new_out_sizes_hw = calc_out_sizes_hw(
       *graph,
       self.sizes(),
       extra_args[0],