
Commit f4c8109

metascroy authored and HDCharles committed
Lowbit custom torch op
Differential Revision: D61896155
Pull Request resolved: #780
1 parent ed8c423 commit f4c8109

5 files changed: +516 −0 lines changed

Lines changed: 42 additions & 0 deletions (a new CMake build file)
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)
project(examples)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)

add_compile_options("-Wall" "-Werror")

include(CMakePrintHelpers)
message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}")
include_directories(${TORCHAO_LIBRARIES})

add_library(
  torchao_dep
  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
  ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
)

include(FetchContent)
FetchContent_Declare(pthreadpool
  GIT_REPOSITORY https://github.com/Maratyszcza/pthreadpool.git
  GIT_TAG master)
FetchContent_MakeAvailable(pthreadpool)

find_package(Torch REQUIRED)
message("TORCH_INCLUDE_DIRS: ${TORCH_INCLUDE_DIRS}")
include_directories("${TORCH_INCLUDE_DIRS}")

add_library(torch_custom_op SHARED torch_custom_op.cpp)
target_link_libraries(torch_custom_op PRIVATE "${TORCH_LIBRARIES}")
target_link_libraries(torch_custom_op PRIVATE torchao_dep)
target_compile_definitions(torch_custom_op PRIVATE TORCHAO_PARALLEL_PTHREADPOOL=1)
target_link_libraries(torch_custom_op PRIVATE pthreadpool)
Lines changed: 18 additions & 0 deletions (a new build script)
@@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../../..

export CMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')"
echo "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}"
export CMAKE_OUT=/tmp/cmake-out/torch_ao/examples/torch_custom_op
cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \
    -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
    -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op \
    -B ${CMAKE_OUT}
cmake --build ${CMAKE_OUT}
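
Once the build finishes, the shared library must be loaded into the Python process before the torchao ops become visible. A minimal sketch, assuming a Linux build where CMake emits libtorch_custom_op.so into ${CMAKE_OUT}; the exact file name and extension are platform-dependent (e.g. .dylib on macOS):

import torch

# Hypothetical output path; adjust to wherever CMake placed the library.
torch.ops.load_library(
    "/tmp/cmake-out/torch_ao/examples/torch_custom_op/libtorch_custom_op.so"
)

# The schemas registered by TORCH_LIBRARY(torchao, m) in the C++ file
# later in this commit are then callable:
print(torch.ops.torchao._linear_4bit)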
Lines changed: 83 additions & 0 deletions (a new Python example script)
@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from torch_custom_op import quantize, replace_linear_with_quantized_linear
import torch
import copy

group_size = 16
m = 1
n = 4096
k = 4096
nbit = 4
n_layers = 10

print("Creating random model")
layers = [torch.nn.Linear(k, n, bias=False) for _ in range(n_layers)]
model = torch.nn.Sequential(*layers)
model = model.eval()

print("Quantizing random model")
quantized_model = copy.deepcopy(model)
quantized_model = quantized_model.eval()
replace_linear_with_quantized_linear(quantized_model, kwargs={"group_size": group_size, "nbit": nbit})

print("Creating random activations")
activations = torch.randn(m, k, dtype=torch.float32)

print("Exporting quantized model")
exported = torch.export.export(quantized_model, (activations,))

print("Using torch.compile on quantized model")
quantized_model_compiled = torch.compile(quantized_model)
with torch.no_grad():
    quantized_model_compiled(activations)

print("Compiling quantized model with AOTI")
torch._export.aot_compile(
    quantized_model,
    (activations,),
    options={"aot_inductor.output_path": "/tmp/torch_custom_op_example_model.so"},
)

print("Running AOTI")
fn = torch._export.aot_load("/tmp/torch_custom_op_example_model.so", "cpu")
fn(activations)


print("Checking correctness on layer 0")

rtol = 1e-05

# default is 1e-8, but PyTorch and C++ (and ARM neon) have different rounding
# conventions for ties (PyTorch rounds half to even and C++ rounds half to odd)
# TODO(T200109708): address this
atol = 1e-05

linear = model[0]
quantized_linear = quantized_model[0]
weight_qvals, weight_scales = quantize(linear.weight, group_size, quantized_linear.nbit, scale_only=True)

activation_qvals, activations_scales, activations_zeros = quantize(activations, k, 8, False)
activations_dequantized = activations_scales * (activation_qvals - activations_zeros)
weights_dequantized = (weight_qvals.reshape(-1, group_size) * weight_scales.reshape(-1, 1)).reshape(n, k)

with torch.no_grad():
    result = quantized_linear(activations)
    expected_result = torch.matmul(activations_dequantized, weights_dequantized.transpose(1, 0))
    non_quantized_result = linear(activations)

if not (torch.allclose(result, expected_result, rtol=rtol, atol=atol)):
    rand_idxs = torch.randint(0, result.shape[1], (5,))
    print("rand_idxs: ", rand_idxs)
    print("kernel_result[rand_idxs]: ", result[0][rand_idxs])
    print("expected_result[rand_idxs]: ", expected_result[0][rand_idxs])
    assert False
else:
    print("Correctness check passed")

print("kernel_result[0:5]: ", result[0][0:5])
print("non_quantized_result[0:5]: ", non_quantized_result[0][0:5])
Lines changed: 212 additions & 0 deletions (a new C++ source file implementing the custom ops)
@@ -0,0 +1,212 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include <vector>

#include <torch/library.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight.h>
#include <torchao/experimental/kernels/cpu/parallel.h>

template <int weight_nbit>
at::Tensor pack_weights_cpu(
    const at::Tensor& weight_qvals,
    const at::Tensor& weight_scales,
    // TODO(T200095131): convert to int64_t when supported by AOTI
    // group_size is a meta tensor with size (group_size)
    const at::Tensor& group_size_tensor) {
  int64_t group_size = group_size_tensor.size(0);

  TORCH_CHECK(
      weight_qvals.dtype() == torch::kInt8, "weight_qvals must be int8");
  TORCH_CHECK(weight_qvals.dim() == 2, "weight_qvals must be 2D");

  // In PyTorch, weights are nxk in row-major format (with activations being
  // right-multiplied).
  // In the kernel, activations are left-multiplied by kxn transposed
  // weights in column-major format.
  // Note the underlying data is the same in both cases.
  int n = weight_qvals.size(0);
  int k = weight_qvals.size(1);

  TORCH_CHECK(
      weight_scales.dtype() == torch::kFloat32,
      "weight_scales must be float32");
  TORCH_CHECK(weight_scales.dim() == 1, "weight_scales must be 1D");
  TORCH_CHECK(
      weight_scales.size(0) == ((n * k) / group_size),
      "expected 1 scale per group");

  using namespace torchao::operators::cpu::linear::
      channelwise_8bit_activation_groupwise_lowbit_weight;

  auto ukernel_config = get_ukernel_config<
      weight_nbit,
      false /*has_weight_zeros*/,
      false /*has_bias*/,
      false /*has_clamp*/>();
  auto pack_weight_tiling_params = get_default_pack_weight_data_tiling_params(
      ukernel_config, n, /*target_panels_per_thread=*/1);

  torchao::set_num_threads(torch::get_num_threads());

  auto packed_weight_data_size =
      get_packed_weight_data_size(ukernel_config, n, k, group_size);
  auto options = torch::TensorOptions().dtype(torch::kInt8);

  at::Tensor packed_weights = torch::empty({packed_weight_data_size}, options);
  pack_weight_data_operator(
      ukernel_config,
      pack_weight_tiling_params,
      packed_weights.data_ptr<int8_t>(),
      n,
      k,
      group_size,
      weight_qvals.const_data_ptr<int8_t>(),
      weight_scales.const_data_ptr<float>(),
      /*weight_zeros=*/nullptr);

  return packed_weights;
}

template <int weight_nbit>
at::Tensor pack_weights_meta(
    const at::Tensor& weight_qvals,
    const at::Tensor& weight_scales,
    // TODO(T200095131): convert to int64_t when supported by AOTI
    // group_size is a meta tensor with size (group_size)
    const at::Tensor& group_size_tensor) {
  int64_t group_size = group_size_tensor.size(0);

  int n = weight_qvals.size(0);
  int k = weight_qvals.size(1);

  using namespace torchao::operators::cpu::linear::
      channelwise_8bit_activation_groupwise_lowbit_weight;

  auto ukernel_config = get_ukernel_config<
      weight_nbit,
      false /*has_weight_zeros*/,
      false /*has_bias*/,
      false /*has_clamp*/>();

  auto packed_weight_data_size =
      get_packed_weight_data_size(ukernel_config, n, k, group_size);
  return torch::empty({packed_weight_data_size}).to("meta");
}

template <int weight_nbit>
at::Tensor linear_cpu(
    const at::Tensor& packed_weights,
    // TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
    // int64_t when supported by AOTI. Currently they are meta tensors with
    // size equal to the int they wrap.
    const at::Tensor& n_tensor,
    const at::Tensor& k_tensor,
    const at::Tensor& group_size_tensor,
    const at::Tensor& activations) {
  int n = n_tensor.size(0);
  int k = k_tensor.size(0);
  int group_size = group_size_tensor.size(0);

  TORCH_CHECK(
      activations.dtype() == torch::kFloat32, "activations must be float32");
  TORCH_CHECK(activations.dim() == 2, "activations must be 2D");
  int m = activations.size(0);
  int k_ = activations.size(1);
  TORCH_CHECK(k == k_, "activation shape is incompatible with packed weights.");

  using namespace torchao::operators::cpu::linear::
      channelwise_8bit_activation_groupwise_lowbit_weight;

  auto ukernel_config = get_ukernel_config<
      weight_nbit,
      false /*has_weight_zeros*/,
      false /*has_bias*/,
      false /*has_clamp*/>();
  auto linear_tiling_params = get_default_linear_tiling_params(
      ukernel_config,
      m,
      n,
      /*target_tiles_per_thread=*/5);
  auto linear_scheduling_policy =
      LinearTileSchedulingPolicy::single_mc_parallel_nc;

  torchao::set_num_threads(torch::get_num_threads());

  auto activation_data_buffer_size = get_activation_data_buffer_size(
      ukernel_config,
      linear_tiling_params,
      linear_scheduling_policy,
      m,
      k,
      group_size);
  std::vector<char> activation_data_buffer(activation_data_buffer_size);

  at::Tensor output_tensor = torch::empty({m, n}, torch::kFloat32);
  linear_operator(
      ukernel_config,
      linear_tiling_params,
      linear_scheduling_policy,
      activation_data_buffer.data(),
      output_tensor.data_ptr<float>(),
      m,
      n,
      k,
      group_size,
      packed_weights.const_data_ptr<int8_t>(),
      activations.const_data_ptr<float>(),
      /*bias=*/nullptr,
      // Clamp parameters are ignored because the config is created with
      // has_clamp = false
      /*clamp_min=*/0.0,
      /*clamp_max=*/0.0);

  return output_tensor;
}

template <int weight_nbit>
at::Tensor linear_meta(
    const at::Tensor& packed_weights,
    // TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
    // int64_t when supported by AOTI.
    // Currently they are meta tensors with size equal to the int they wrap.
    const at::Tensor& n_tensor,
    const at::Tensor& k_tensor,
    const at::Tensor& group_size_tensor,
    const at::Tensor& activations) {
  int n = n_tensor.size(0);
  int k = k_tensor.size(0);

  int m = activations.size(0);
  int k_ = activations.size(1);
  TORCH_CHECK(k == k_, "activation shape is incompatible with packed weights.");
  return torch::empty({m, n}).to("meta");
}

TORCH_LIBRARY(torchao, m) {
  m.def(
      "_pack_weights_3bit(Tensor weight_qvals, Tensor weight_scales, Tensor group_size) -> Tensor");
  m.def(
      "_linear_3bit(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations) -> Tensor");
  m.def(
      "_pack_weights_4bit(Tensor weight_qvals, Tensor weight_scales, Tensor group_size) -> Tensor");
  m.def(
      "_linear_4bit(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations) -> Tensor");
}

TORCH_LIBRARY_IMPL(torchao, CPU, m) {
  m.impl("_pack_weights_3bit", &pack_weights_cpu<3>);
  m.impl("_linear_3bit", &linear_cpu<3>);
  m.impl("_pack_weights_4bit", &pack_weights_cpu<4>);
  m.impl("_linear_4bit", &linear_cpu<4>);
}

TORCH_LIBRARY_IMPL(torchao, Meta, m) {
  m.impl("_pack_weights_3bit", &pack_weights_meta<3>);
  m.impl("_linear_3bit", &linear_meta<3>);
  m.impl("_pack_weights_4bit", &pack_weights_meta<4>);
  m.impl("_linear_4bit", &linear_meta<4>);
}
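
Because AOTI does not yet support plain int64_t arguments for these ops (see the TODO(T200095131) comments), n, k, and group_size are smuggled in as empty tensors whose size(0) encodes the integer. A minimal end-to-end sketch of calling the registered 4-bit ops under that convention, assuming the shared library has been loaded as shown earlier and that the host supports the aarch64 kernels:

import torch

n, k, group_size = 4096, 4096, 16
weight_qvals = torch.randint(-8, 8, (n, k), dtype=torch.int8)  # signed 4-bit range
weight_scales = torch.rand(n * k // group_size)                # one scale per group
activations = torch.randn(1, k, dtype=torch.float32)

# Each integer argument travels as an empty tensor whose size(0) is the value.
packed = torch.ops.torchao._pack_weights_4bit(
    weight_qvals, weight_scales, torch.empty(group_size))
out = torch.ops.torchao._linear_4bit(
    packed, torch.empty(n), torch.empty(k), torch.empty(group_size), activations)
print(out.shape)  # torch.Size([1, 4096])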
