diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh
new file mode 100644
index 00000000000..cde72e41ac7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef QUANTIZE_GLSLH
+#define QUANTIZE_GLSLH
+
+// Affine quantization: clamp(zero_point_val + roundEven(value / scale_val)).
+OUT_T quantize_val(IN_T value, float scale_val, int zero_point_val) {
+  float inv_scale = 1.0 / scale_val;
+  // round() leaves exact .5 ties implementation-defined; roundEven() does not.
+  float rounded_float = roundEven(inv_scale * float(value));
+
+  int qvalue = zero_point_val + int(rounded_float);
+  // quant_min / quant_max are push constants declared by the including shader.
+  qvalue = max(qvalue, quant_min);
+  qvalue = min(qvalue, quant_max);
+  return OUT_T(qvalue);
+}
+
+#endif // QUANTIZE_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
new file mode 100644
index 00000000000..ea0c2f7dce7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define IN_T ${buffer_scalar_type(IN_DTYPE)}
+#define OUT_T ${buffer_scalar_type(OUT_DTYPE)}
+
+#define ${MODE}
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(IN_DTYPE)}
+${define_required_extensions(OUT_DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}
+
+$if MODE == "per_tensor":
+  layout(push_constant) uniform restrict Block {
+    float scale;
+    int zero_point;
+    int quant_min;
+    int quant_max;
+  };
+$if MODE == "per_token":
+  ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+  ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
+  layout(push_constant) uniform restrict Block {
+    int num_tokens;
+    int quant_min;
+    int quant_max;
+  };
+
+${layout_declare_ubo(B, "int", "out_numel")}
+${layout_declare_ubo(B, "ivec4", "t_in_sizes")}
+${layout_declare_ubo(B, "ivec4", "t_in_strides")}
+${layout_declare_ubo(B, "ivec4", "t_out_sizes")}
+${layout_declare_ubo(B, "ivec4", "t_out_strides")}
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+
+#include "quantize.glslh"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+const lowp ivec4 in_dim_order = unhash_dim_order(in_layout);
+
+/*
+ * QUANTIZATION SHADER (BUFFER STORAGE)
+ *
+ * This shader converts floating-point tensor values to n-bit integer representations
+ * using pre-computed quantization parameters (scale and zero_point). The quantization
+ * maps floating-point values to a discrete integer range while preserving the
+ * original data distribution as much as possible.
+ *
+ * ALGORITHM:
+ * 1. Load floating-point input value from buffer
+ * 2. Apply quantization formula: qvalue = round(value / scale) + zero_point
+ * 3. Clamp result to [quant_min, quant_max] range
+ * 4. Store quantized integer value to output buffer
+ *
+ * WORKGROUP CONFIGURATION:
+ * - Per-Tensor Mode:
+ *   - Global WG Size: {num_elements, 1, 1} (one thread per tensor element)
+ *   - Local WG Size: Default (typically {64, 1, 1} or based on global WG size)
+ * - Per-Token Mode:
+ *   - Global WG Size: {num_elements, 1, 1} (one thread per tensor element)
+ *   - Local WG Size: Default (typically {64, 1, 1} or based on global WG size)
+ *
+ * SUPPORTED CONFIGURATIONS:
+ * - Per-Tensor Config: Uses linear buffer indexing with stride-based tensor access
+ *   and supports any tensor layout through stride calculations and dimension ordering
+ * - Per-Token Config: Assumes width-packed layout (packed_dim = 0),
+ *   since that is how the token index is calculated
+ *
+ * QUANTIZATION FORMULA VISUALIZATION:
+ * For input range [min_val, max_val] mapped to integer range [quant_min, quant_max]:
+ *
+ * Floating Point Domain:    Integer Domain:
+ * min_val ────────────────► quant_min
+ *    │                         │
+ *    │    scale = (max_val - min_val) / (quant_max - quant_min)
+ *    │    zero_point = quant_min - round(min_val / scale)
+ *    │                         │
+ * max_val ────────────────► quant_max
+ *
+ * Quantization Process:
+ * Input: 2.5 (float)
+ * Step 1: value / scale = 2.5 / 0.1 = 25.0
+ * Step 2: round(25.0) + zero_point = 25 + (-128) = -103
+ * Step 3: clamp(-103, -128, 127) = -103
+ * Output: -103 (int8)
+ *
+ * PER-TENSOR QUANTIZATION:
+ * - Single scale and zero_point values for entire tensor
+ * - All elements use same quantization parameters
+ * - Parameters passed as push constants for efficiency
+ * - Formula: qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max)
+ *
+ * PER-TOKEN QUANTIZATION:
+ * - Separate scale and zero_point for each token
+ * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements)
+ * - Parameters stored in buffer arrays indexed by token_id
+ * - Each thread calculates its token_id from tensor coordinates
+ * - Formula: qvalue = clamp(round(value / scale[token_id]) + zero_point[token_id], quant_min, quant_max)
+ */
+
+#ifdef per_tensor
+
+// One invocation per output element: read the matching input element,
+// quantize it with the tensor-wide scale / zero_point, and store the result.
+void quantize_per_tensor() {
+  const int dst_idx = int(gl_GlobalInvocationID.x);
+
+  // Only invocations that map to a real output element do any work.
+  if (dst_idx < out_numel) {
+    // Translate the output buffer index into the input's buffer index.
+    const ivec4 tidx = bufi_to_tidx(dst_idx, t_out_strides, out_dim_order);
+    const int src_idx = tidx_to_bufi(tidx, t_in_strides);
+
+    const IN_T src_val = t_in[src_idx];
+    t_out[dst_idx] = quantize_val(src_val, scale, zero_point);
+  }
+}
+
+#else
+
+// One invocation per output element. The element's token is the flattened
+// index over the .y/.z/.w coordinates; it selects the scale / zero_point.
+void quantize_per_token() {
+  const int dst_idx = int(gl_GlobalInvocationID.x);
+  if (dst_idx >= out_numel) {
+    return;
+  }
+
+  const ivec4 tidx = bufi_to_tidx(dst_idx, t_out_strides, out_dim_order);
+  const IN_T src_val = t_in[tidx_to_bufi(tidx, t_in_strides)];
+
+  // Stays 0 when the .y, .z and .w sizes are all 1.
+  int token = 0;
+  if (t_out_sizes.w > 1) {
+    // 4D: fold .w, .z and .y together, with .y as the innermost index.
+    const int plane = t_out_sizes.z * t_out_sizes.y;
+    token = tidx.w * plane + tidx.z * t_out_sizes.y + tidx.y;
+  } else if (t_out_sizes.z > 1) {
+    // 3D: fold .z and .y together.
+    token = tidx.z * t_out_sizes.y + tidx.y;
+  } else if (t_out_sizes.y > 1) {
+    // 2D: the .y coordinate is the token.
+    token = tidx.y;
+  }
+
+  // Keep the lookup inside the scale / zero_point buffers.
+  token = min(token, num_tokens - 1);
+
+  const float token_scale = t_scale[token];
+  const int token_zp = t_zero_point[token];
+  t_out[dst_idx] = quantize_val(src_val, token_scale, token_zp);
+}
+
+#endif
+
+void main() {
+  quantize_${MODE}(); // MODE is a template parameter: per_tensor or per_token
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
new file mode 100644
index 00000000000..90af2590936
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
@@ -0,0 +1,18 @@
+quantize_buffer:
+  parameter_names_with_default_values:
+    IN_DTYPE: float  # element type of the input tensor (IN_T in the shader)
+    OUT_DTYPE: int32  # element type of the quantized output (OUT_T in the shader)
+    MODE: per_tensor  # emitted as a #define that selects the shader entry point
+  generate_variant_forall:
+    IN_DTYPE:
+      - VALUE: half
+      - VALUE: float
+    OUT_DTYPE:
+      - VALUE: uint8
+      - VALUE: int8
+      - VALUE: int32
+  shader_variants:
+    - NAME: quantize_per_tensor_buffer  # presumably the prefix Quantize.cpp builds kernel names from - confirm
+      MODE: per_tensor
+    - NAME: quantize_per_token_buffer
+      MODE: per_token
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
new file mode 100644
index 00000000000..9ba7074f75b
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define IN_T ${buffer_scalar_type(IN_DTYPE)}
+#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")}
+
+#define OUT_T ${buffer_scalar_type(OUT_DTYPE)}
+#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")}
+
+#define ${MODE}
+
+${define_active_storage_type("texture3d")}
+${define_required_extensions(IN_DTYPE)}
+${define_required_extensions(OUT_DTYPE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")}
+
+$if MODE == "per_tensor":
+  layout(push_constant) uniform restrict Block {
+    float scale;
+    int zero_point;
+    int quant_min;
+    int quant_max;
+  };
+$if MODE == "per_token":
+  ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
+  ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
+
+  layout(push_constant) uniform restrict Block {
+    int num_tokens;
+    int quant_min;
+    int quant_max;
+  };
+
+${layout_declare_ubo(B, "ivec3", "t_in_limits")}
+${layout_declare_ubo(B, "ivec3", "t_out_limits")}
+
+#include "indexing_utils.h"
+#include "quantize.glslh"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * QUANTIZATION SHADER (TEXTURE STORAGE)
+ *
+ * This shader converts floating-point tensor values to n-bit integer representations
+ * using pre-computed quantization parameters (scale and zero_point). The quantization
+ * maps floating-point values to a discrete integer range while preserving the
+ * original data distribution as much as possible.
+ *
+ * ALGORITHM:
+ * 1. Load floating-point texel (4 values) from 3D texture
+ * 2. Apply quantization formula to each component: qvalue = round(value / scale) + zero_point
+ * 3. Clamp each result to [quant_min, quant_max] range
+ * 4. Store quantized integer texel to output texture
+ *
+ * WORKGROUP CONFIGURATION:
+ * - Per-Tensor Mode:
+ *   - Global WG Size: one thread per texel, e.g. {ceil(W/4), H, C} for input size (W, H, C) with width-packing
+ *   - Local WG Size: Default (typically {8, 8, 1} or based on global WG size)
+ * - Per-Token Mode:
+ *   - Global WG Size: one thread per texel, e.g. {ceil(W/4), H, C} for input size (W, H, C) with width-packing
+ *   - Local WG Size: Default (typically {8, 8, 1} or based on global WG size)
+ *
+ * SUPPORTED CONFIGURATIONS:
+ * - Texture Storage: Uses 3D texture indexing with texel-based processing
+ * - Assumes width-packed layout (packed_dim = 0) in current implementation
+ * - Handles texel padding for non-multiple-of-4 tensor dimensions
+ * - For per-token mode: scale/zero_point tensors must use buffer storage
+ *
+ * QUANTIZATION FORMULA VISUALIZATION:
+ * For input range [min_val, max_val] mapped to integer range [quant_min, quant_max]:
+ *
+ * Floating Point Domain:    Integer Domain:
+ * min_val ────────────────► quant_min
+ *    │                         │
+ *    │    scale = (max_val - min_val) / (quant_max - quant_min)
+ *    │    zero_point = quant_min - round(min_val / scale)
+ *    │                         │
+ * max_val ────────────────► quant_max
+ *
+ * Texel Quantization Process:
+ * Input Texel: [2.5, -1.0, 0.5, 3.2] (float4)
+ * Per-component quantization with scale=0.1, zero_point=-128:
+ * Component 0: round(2.5 / 0.1) + (-128) = 25 + (-128) = -103
+ * Component 1: round(-1.0 / 0.1) + (-128) = -10 + (-128) = -138 → clamp to -128
+ * Component 2: round(0.5 / 0.1) + (-128) = 5 + (-128) = -123
+ * Component 3: round(3.2 / 0.1) + (-128) = 32 + (-128) = -96
+ * Output Texel: [-103, -128, -123, -96] (int4)
+ *
+ * PER-TENSOR QUANTIZATION:
+ * - Single scale and zero_point values for entire tensor
+ * - All texel components use same quantization parameters
+ * - Parameters passed as push constants for efficiency
+ * - Each thread processes one texel (4 elements) independently
+ * - Formula: qvalue[i] = clamp(round(value[i] / scale) + zero_point, quant_min, quant_max)
+ *
+ * PER-TOKEN QUANTIZATION:
+ * - Separate scale and zero_point for each token
+ * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements)
+ * - Parameters stored in buffer arrays indexed by token_id
+ * - Each thread calculates token_id from its 3D texture position
+ * - Scale/zero_point buffers accessed directly (not as textures)
+ * - Formula: qvalue[i] = clamp(round(value[i] / scale[token_id]) + zero_point[token_id], quant_min, quant_max)
+ */
+
+#ifdef per_tensor
+
+// One invocation per texel: quantize all four components with the
+// tensor-wide scale / zero_point and write the texel to the same position.
+void quantize_per_tensor() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+
+  // Skip invocations that fall outside the input texture's limits.
+  if (!any(greaterThanEqual(texel_pos, t_in_limits))) {
+    const FVEC4_T src_texel = load_texel(t_in, texel_pos);
+    IVEC4_T dst_texel;
+
+    [[unroll]] for (int comp = 0; comp < 4; ++comp) {
+      dst_texel[comp] = quantize_val(IN_T(src_texel[comp]), scale, zero_point);
+    }
+
+    write_texel(t_out, texel_pos, dst_texel);
+  }
+}
+
+#else
+
+// One invocation per texel. The token is derived from the texel's (y, z)
+// position and selects the scale / zero_point used for all four components.
+void quantize_per_token() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+  if (any(greaterThanEqual(texel_pos, t_in_limits))) {
+    return;
+  }
+
+  const FVEC4_T src_texel = load_texel(t_in, texel_pos);
+
+  // Stays 0 when the limits are 1 in both y and z.
+  int token = 0;
+  if (t_in_limits.z > 1) {
+    // Several z slices: flatten (z, y) with y as the inner index.
+    token = texel_pos.z * t_in_limits.y + texel_pos.y;
+  } else if (t_in_limits.y > 1) {
+    // Single z slice with several rows: the row index is the token.
+    token = texel_pos.y;
+  }
+
+  // Keep the lookup inside the scale / zero_point buffers.
+  token = min(token, num_tokens - 1);
+
+  // t_scale and t_zero_point are bound as buffers, so index them directly.
+  const float token_scale = t_scale[token];
+  const int token_zp = t_zero_point[token];
+
+  // Quantize each component of the texel independently.
+  IVEC4_T dst_texel;
+  [[unroll]] for (int comp = 0; comp < 4; ++comp) {
+    dst_texel[comp] =
+        quantize_val(IN_T(src_texel[comp]), token_scale, token_zp);
+  }
+
+  write_texel(t_out, texel_pos, dst_texel);
+}
+
+#endif
+
+void main() {
+  quantize_${MODE}(); // MODE is a template parameter: per_tensor or per_token
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
new file mode 100644
index 00000000000..042eb0f8196
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
@@ -0,0 +1,18 @@
+quantize_texture:
+  parameter_names_with_default_values:
+    IN_DTYPE: float  # element type of the input tensor (IN_T / FVEC4_T in the shader)
+    OUT_DTYPE: int32  # element type of the quantized output (OUT_T / IVEC4_T in the shader)
+    MODE: per_tensor  # emitted as a #define that selects the shader entry point
+  generate_variant_forall:
+    IN_DTYPE:
+      - VALUE: half
+      - VALUE: float
+    OUT_DTYPE:
+      - VALUE: uint8
+      - VALUE: int8
+      - VALUE: int32
+  shader_variants:
+    - NAME: quantize_per_tensor_texture3d  # presumably the prefix Quantize.cpp builds kernel names from - confirm
+      MODE: per_tensor
+    - NAME: quantize_per_token_texture3d
+      MODE: per_token
diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
new file mode 100644
index 00000000000..35712d59fb9
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+namespace vkcompute {
+
+namespace {
+
+// Resize hook: the quantized output always takes the input tensor's sizes.
+void resize_quantize_output(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args; // both node builders register an empty resize-arg list
+  const ValueRef src = args.at(1).refs.at(0);
+  graph->virtual_resize(args.at(0).refs.at(0), graph->sizes_of(src));
+}
+
+} // namespace
+
+// Appends a dispatch node that quantizes `input` into `output` with a single
+// scale / zero_point pair; the scalar parameters travel as push constants.
+void add_quantize_per_tensor_node(
+    ComputeGraph& graph,
+    const ValueRef& input,
+    const ValueRef& scale,
+    const ValueRef& zero_point,
+    const ValueRef& quant_min,
+    const ValueRef& quant_max,
+    const ValueRef& output) {
+  // Shader name: quantize_per_tensor + storage suffix + input/output dtypes.
+  std::string kernel_name("quantize_per_tensor");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(output));
+
+  float scale_val = static_cast<float>(graph.get_double(scale));
+  int zero_point_val = static_cast<int>(graph.get_int(zero_point));
+  int quant_min_val = static_cast<int>(graph.get_int(quant_min));
+  int quant_max_val = static_cast<int>(graph.get_int(quant_max));
+
+  // Same push-constant list for both storage types; the order mirrors the
+  // shader's Block { scale, zero_point, quant_min, quant_max }.
+  std::vector<PushConstantDataInfo> push_constants = {
+      PushConstantDataInfo(&scale_val, sizeof(float)),
+      PushConstantDataInfo(&zero_point_val, sizeof(int)),
+      PushConstantDataInfo(&quant_min_val, sizeof(int)),
+      PushConstantDataInfo(&quant_max_val, sizeof(int)),
+  };
+
+  // Buffer shaders bind numel/sizes/strides UBOs; texture shaders bind the
+  // logical limits of input and output instead.
+  vkapi::ParamsBindList param_ubos;
+  if (graph.is_buffer_storage(input)) {
+    param_ubos = {
+        graph.numel_ubo(input),
+        graph.sizes_ubo(input),
+        graph.strides_ubo(input),
+        graph.sizes_ubo(output),
+        graph.strides_ubo(output)};
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)};
+  }
+
+  vkapi::SpecVarList spec_vars = {
+      graph.hashed_layout_of(output),
+      graph.hashed_layout_of(input),
+  };
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{output, vkapi::kWrite}, {input, vkapi::kRead}},
+      // Shader param buffers
+      param_ubos,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_quantize_output));
+}
+
+// Appends a dispatch node that quantizes `input` into `output` with one
+// scale / zero_point per token; both are bound to the shader as tensors.
+void add_quantize_per_token_node(
+    ComputeGraph& graph,
+    const ValueRef& input,
+    const ValueRef& scale,
+    const ValueRef& zero_point,
+    const ValueRef& quant_min,
+    const ValueRef& quant_max,
+    const ValueRef& output) {
+  // Shader name: quantize_per_token + storage suffix + input/output dtypes.
+  std::string kernel_name("quantize_per_token");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(input));
+  add_dtype_suffix(kernel_name, graph.dtype_of(output));
+
+  int quant_min_val = static_cast<int>(graph.get_int(quant_min));
+  int quant_max_val = static_cast<int>(graph.get_int(quant_max));
+  // The shader clamps its token index against the length of the scale tensor.
+  int num_tokens = static_cast<int>(graph.sizes_of(scale)[0]);
+
+  // Same push-constant list for both storage types (order matches the shader).
+  std::vector<PushConstantDataInfo> push_constants = {
+      PushConstantDataInfo(&num_tokens, sizeof(int)),
+      PushConstantDataInfo(&quant_min_val, sizeof(int)),
+      PushConstantDataInfo(&quant_max_val, sizeof(int)),
+  };
+
+  // Buffer shaders bind numel/sizes/strides UBOs; texture shaders bind the
+  // logical limits of input and output instead.
+  vkapi::ParamsBindList param_ubos;
+  if (graph.is_buffer_storage(input)) {
+    param_ubos = {
+        graph.numel_ubo(input),
+        graph.sizes_ubo(input),
+        graph.strides_ubo(input),
+        graph.sizes_ubo(output),
+        graph.strides_ubo(output),
+    };
+  } else {
+    param_ubos = {
+        graph.logical_limits_ubo(input),
+        graph.logical_limits_ubo(output),
+    };
+  }
+
+  vkapi::SpecVarList spec_vars = {
+      graph.hashed_layout_of(output),
+      graph.hashed_layout_of(input),
+  };
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{output, vkapi::kWrite},
+       {input, vkapi::kRead},
+       {{scale, zero_point}, vkapi::kRead}},
+      // Shader param buffers
+      param_ubos,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      {},
+      // Resizing Logic
+      resize_quantize_output));
+}
+
+// Entry point for quantize_per_tensor.default; validates args, adds the node.
+void quantize_per_tensor_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  const ValueRef input = args[0];
+  const ValueRef scale = args[1];
+  const ValueRef zero_point = args[2];
+  const ValueRef quant_min = args[3];
+  const ValueRef quant_max = args[4];
+  const ValueRef output = args[5];
+
+  // Both ends of the op must be tensors.
+  VK_CHECK_COND(graph.val_is_tensor(input));
+  VK_CHECK_COND(graph.val_is_tensor(output));
+
+  // Shader variants are only generated for float and half inputs.
+  VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kFloat ||
+      graph.dtype_of(input) == vkapi::kHalf);
+
+  add_quantize_per_tensor_node(
+      graph, input, scale, zero_point, quant_min, quant_max, output);
+}
+
+// Entry point for quantize_per_token.default; validates args, adds the node.
+void quantize_per_token_impl(
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) {
+  const ValueRef input = args[0];
+  const ValueRef scale = args[1];
+  const ValueRef zero_point = args[2];
+  const ValueRef quant_min = args[3];
+  const ValueRef quant_max = args[4];
+  const ValueRef output = args[5];
+
+  // Check tensor types
+  VK_CHECK_COND(graph.val_is_tensor(input));
+  VK_CHECK_COND(graph.val_is_tensor(scale));
+  VK_CHECK_COND(graph.val_is_tensor(zero_point));
+  VK_CHECK_COND(graph.val_is_tensor(output));
+
+  // Verify input is a floating point type
+  VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kFloat ||
+      graph.dtype_of(input) == vkapi::kHalf);
+
+  // The shaders index scale / zero_point directly as width-packed buffers.
+  VK_CHECK_COND(graph.is_buffer_storage(scale));
+  VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim);
+  VK_CHECK_COND(graph.is_buffer_storage(zero_point));
+  VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim);
+
+  // Check that tensors with texture storage have standard axis map
+  if (!graph.is_buffer_storage(input)) {
+    VK_CHECK_COND(graph.has_standard_axis_map(input));
+  }
+  if (!graph.is_buffer_storage(output)) {
+    VK_CHECK_COND(graph.has_standard_axis_map(output));
+  }
+
+  // Number of tokens = product of all dimensions except the last one. The
+  // `i + 1 < size()` bound cannot wrap around when the size list is empty.
+  int64_t num_tokens = 1;
+  const auto& input_sizes = graph.sizes_of(input);
+  for (size_t i = 0; i + 1 < input_sizes.size(); i++) {
+    num_tokens *= input_sizes[i];
+  }
+
+  const auto& scale_sizes = graph.sizes_of(scale);
+  const auto& zero_point_sizes = graph.sizes_of(zero_point);
+  VK_CHECK_COND(scale_sizes.size() == 1);
+  VK_CHECK_COND(zero_point_sizes.size() == 1);
+  VK_CHECK_COND(scale_sizes[0] == num_tokens);
+  VK_CHECK_COND(zero_point_sizes[0] == num_tokens);
+
+  add_quantize_per_token_node(
+      graph, input, scale, zero_point, quant_min, quant_max, output);
+}
+
+REGISTER_OPERATORS { // pairs each op name with its entry point in this file
+  VK_REGISTER_OP(quantize_per_tensor.default, quantize_per_tensor_impl);
+  VK_REGISTER_OP(quantize_per_token.default, quantize_per_token_impl);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
index 8b79dc1ce6b..7ea98b14fb2 100644
--- a/backends/vulkan/test/op_tests/quantize_test.cpp
+++ b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -21,6 +21,9 @@
 
 #include <cassert>
 #include <iostream>
+#include <limits>
+
+constexpr float eps = 1e-7f; // floor applied to scales before quantizing
 
 namespace torch {
 namespace executor {
@@ -383,6 +386,8 @@ void test_reference_quantize_per_tensor(
   // Reshape back to original dimensions
   input = flat_input.reshape(input_sizes_int64);
 
+  scale = scale < eps ? eps : scale;
+
   // Get reference output
   at::Tensor reference_out = quantize_per_tensor_reference_impl(
       input, scale, zero_point, quant_min, quant_max, dtype);
@@ -435,6 +440,8 @@ void test_vulkan_quantize_per_tensor_impl(
   at::Tensor input =
       at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype));
 
+  scale = scale < eps ? eps : scale;
+
   // Get reference output
   at::Tensor reference_out = torch::executor::native::quantize_per_tensor_aten(
       input, scale, zero_point, quant_min, quant_max, dtype);
@@ -490,7 +497,7 @@ void test_vulkan_quantize_per_tensor_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);
 
-  const bool output_correct = at::equal(reference_int, vk_int);
+  const bool output_correct = at::allclose(reference_int, vk_int);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);
 
@@ -500,6 +507,10 @@ void test_vulkan_quantize_per_tensor_impl(
     std::cout << "  zero_point: " << zero_point << std::endl;
     std::cout << "  quant_min: " << quant_min << std::endl;
     std::cout << "  quant_max: " << quant_max << std::endl;
+    std::cout << "  storage type: "
+              << (in_storage == vkcompute::utils::kBuffer ? "buffer"
+                                                          : "texture")
+              << std::endl;
 
     std::cout << "input:" << std::endl;
     std::cout << input << std::endl;
@@ -564,9 +575,89 @@ TEST(
       at::kInt);
 }
 
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_float_to_uint8) {
+  // Only run when the adapter reports full int8 buffer support.
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  test_vulkan_quantize_per_tensor(
+      {5, 3, 2, 4}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      0, // quant_min
+      255, // quant_max
+      at::kFloat, // input dtype
+      at::kByte); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_float_to_int8) {
+  // Only run when the adapter reports full int8 buffer support.
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  test_vulkan_quantize_per_tensor(
+      {5, 3, 2, 4}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat, // input dtype
+      at::kChar); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_float_to_int32) {
+  test_vulkan_quantize_per_tensor(
+      {5, 3, 2, 4}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kFloat, // input dtype
+      at::kInt); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_float_to_int32_small_scale) {
+  test_vulkan_quantize_per_tensor(
+      {2, 8, 1, 3}, // input sizes
+      0.0, // scale (below eps)
+      20, // zero_point
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kFloat, // input dtype
+      at::kInt); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_half_to_int8) {
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_float16_buffers_support() ||
+      !adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // half input and int8 output both need device support
+  }
+  test_vulkan_quantize_per_tensor(
+      {2, 3}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      -128, // quant_min
+      127, // quant_max
+      at::kHalf, // input dtype
+      at::kChar); // output dtype
+}
+
 void test_reference_quantize_per_token(
     const std::vector<int>& input_sizes,
-    const std::vector<float>& scales,
+    const std::vector<float>& pre_scales,
     const std::vector<int>& zero_points,
     int64_t quant_min,
     int64_t quant_max,
@@ -595,9 +686,14 @@ void test_reference_quantize_per_token(
   }
 
   // Verify that the number of tokens matches the size of scales and zero_points
-  ASSERT_EQ(num_tokens, scales.size());
+  ASSERT_EQ(num_tokens, pre_scales.size());
   ASSERT_EQ(num_tokens, zero_points.size());
 
+  std::vector<float> scales = pre_scales;
+  for (auto& s : scales) {
+    s = s < eps ? eps : s;
+  }
+
   // Create scale and zero_point tensors
   at::Tensor scale_tensor =
       at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble));
@@ -646,7 +742,7 @@ void test_reference_quantize_per_token(
 
 void test_vulkan_quantize_per_token_impl(
     const std::vector<int>& input_sizes,
-    const std::vector<float>& scales,
+    const std::vector<float>& pre_scales,
     const std::vector<int>& zero_points,
     int64_t quant_min,
     int64_t quant_max,
@@ -662,9 +758,14 @@ void test_vulkan_quantize_per_token_impl(
     num_tokens *= input_sizes[i];
   }
 
-  ASSERT_EQ(num_tokens, scales.size());
+  ASSERT_EQ(num_tokens, pre_scales.size());
   ASSERT_EQ(num_tokens, zero_points.size());
 
+  std::vector<float> scales = pre_scales;
+  for (auto& s : scales) {
+    s = s < eps ? eps : s;
+  }
+
   // Create input tensor with random values
   std::vector<int64_t> input_sizes_int64(
       input_sizes.begin(), input_sizes.end());
@@ -688,9 +789,15 @@ void test_vulkan_quantize_per_token_impl(
   IOValueRef r_input = graph.add_input_tensor(
       input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);
   IOValueRef r_scale = graph.add_input_tensor(
-      scale_tensor.sizes().vec(), vkapi::kFloat, in_storage);
+      scale_tensor.sizes().vec(),
+      vkapi::kFloat,
+      utils::kBuffer,
+      utils::kWidthPacked);
   IOValueRef r_zero_point = graph.add_input_tensor(
-      zero_point_tensor.sizes().vec(), vkapi::kInt, in_storage);
+      zero_point_tensor.sizes().vec(),
+      vkapi::kInt,
+      utils::kBuffer,
+      utils::kWidthPacked);
 
   const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
   const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);
@@ -744,7 +851,7 @@ void test_vulkan_quantize_per_token_impl(
   at::Tensor reference_int = reference_out.to(at::kInt);
   at::Tensor vk_int = vk_out.to(at::kInt);
 
-  const bool output_correct = at::equal(reference_int, vk_int);
+  const bool output_correct = at::allclose(reference_int, vk_int);
   if (!output_correct) {
     at::Tensor diffs = at::abs(reference_int - vk_int);
 
@@ -841,3 +948,130 @@ TEST(
       at::kHalf,
       at::kByte);
 }
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_float_to_uint8) {
+  // Only run when the adapter reports full int8 buffer support.
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {
+      -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4};
+  std::vector<int> zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12};
+
+  test_vulkan_quantize_per_token(
+      {5, 2, 4}, // input sizes: 5*2 = 10 tokens of 4 elements
+      scales,
+      zero_points,
+      0, // quant_min
+      255, // quant_max
+      at::kFloat, // input dtype
+      at::kByte); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_float_to_int8) {
+  // Only run when the adapter reports full int8 buffer support.
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {
+      -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4};
+  std::vector<int> zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12};
+
+  test_vulkan_quantize_per_token(
+      {5, 2, 4}, // input sizes: 5*2 = 10 tokens of 4 elements
+      scales,
+      zero_points,
+      -128, // quant_min
+      127, // quant_max
+      at::kFloat, // input dtype
+      at::kChar); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_float_to_int32) {
+  std::vector<float> scales = {
+      -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4};
+  std::vector<int> zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12};
+
+  test_vulkan_quantize_per_token(
+      {5, 2, 4}, // input sizes: 5*2 = 10 tokens of 4 elements
+      scales,
+      zero_points,
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kFloat, // input dtype
+      at::kInt); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_float_to_int32_small_scales) {
+  std::vector<float> scales = {
+      0,
+      2.9387358770557188e-39f,
+      1.40129846e-45f,
+      1.17549435e-38f,
+      0.0000000000001};
+  std::vector<int> zero_points = {20, -10, 15, 200, 50};
+  // Every scale above is smaller than the file-level eps.
+  test_vulkan_quantize_per_token(
+      {5, 2}, // input sizes: 5 tokens of 2 elements
+      scales,
+      zero_points,
+      std::numeric_limits<int32_t>::min(), // quant_min
+      std::numeric_limits<int32_t>::max(), // quant_max
+      at::kFloat, // input dtype
+      at::kInt); // quantized dtype
+}
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_float_to_uint8_many_tokens) {
+  // Only run when the adapter reports full int8 buffer support.
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<int> zero_points(18, 5);
+  std::vector<float> scales(18);
+
+  // Even tokens use 0.3, odd tokens use -0.5.
+  for (size_t tok = 0; tok < scales.size(); ++tok) {
+    scales[tok] = (tok % 2 == 0) ? 0.3 : -0.5;
+  }
+
+  test_vulkan_quantize_per_token(
+      {3, 3, 2, 3}, // input sizes: 3*3*2 = 18 tokens of 3 elements
+      scales,
+      zero_points,
+      0, // quant_min
+      125, // quant_max
+      at::kFloat, // input dtype
+      at::kByte); // quantized dtype
+}
+
+TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) {
+  auto* adapter = vkcompute::api::context()->adapter_ptr();
+  if (!adapter->has_full_float16_buffers_support() ||
+      !adapter->has_full_int8_buffers_support()) {
+    GTEST_SKIP(); // half input and int8 output both need device support
+  }
+  std::vector<float> scales = {0.1, 0.2};
+  std::vector<int> zero_points = {0, 5};
+
+  test_vulkan_quantize_per_token(
+      {2, 2}, // input sizes: 2 tokens of 2 elements
+      scales,
+      zero_points,
+      -128, // quant_min
+      127, // quant_max
+      at::kHalf, // input dtype
+      at::kChar); // output dtype
+}
diff --git a/backends/vulkan/test/op_tests/test_utils.cpp b/backends/vulkan/test/op_tests/test_utils.cpp
index 196f079be2c..c5702abd079 100644
--- a/backends/vulkan/test/op_tests/test_utils.cpp
+++ b/backends/vulkan/test/op_tests/test_utils.cpp
@@ -94,7 +94,8 @@ vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {
     case c10::kInt:
       return vkapi::kInt;
     case c10::kLong:
-      return vkapi::kLong;
+      // No support for 64-bit integers
+      return vkapi::kInt;
     case c10::kChar:
       return vkapi::kChar;
     case c10::kByte: