diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index f8a26e8d77..25f6d5da5c 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -202,13 +202,7 @@ void AddInputs(ConversionCtx* ctx, c10::ArrayRef input TORCHTRT_CHECK( profile->isValid(), "Optimization profile is invalid, please check the input range provided (conversion.AddInputs)"); - ctx->cfg->addOptimizationProfile(profile); -#if NV_TENSORRT_MAJOR > 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR >= 1) - if (ctx->enabled_precisions.find(nvinfer1::DataType::kINT8) != ctx->enabled_precisions.end()) { - ctx->cfg->setCalibrationProfile(profile); - } -#endif } void MarkOutputs(ConversionCtx* ctx, at::ArrayRef outputs) { diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index 2eb363706f..c0dbacabc5 100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -31,8 +31,7 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) { if (s.device.device_type == nvinfer1::DeviceType::kDLA) { os << "\n DLACore: " << s.device.dla_core; } - os << "\n Engine Capability: " << s.capability \ - << "\n Calibrator Created: " << (s.calibrator != nullptr); + os << "\n Engine Capability: " << s.capability; return os; } // clang-format on @@ -64,15 +63,7 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) cfg->setFlag(nvinfer1::BuilderFlag::kFP16); break; case nvinfer1::DataType::kINT8: - TORCHTRT_CHECK( - builder->platformHasFastInt8(), "Requested inference in INT8 but platform does not support INT8"); - cfg->setFlag(nvinfer1::BuilderFlag::kINT8); - if (!settings.calibrator) { - LOG_INFO( - "Int8 precision has been enabled but no calibrator provided. This assumes the network has Q/DQ nodes obtained from Quantization aware training. 
For more details, refer to https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-qat-networks"); - } else { - cfg->setInt8Calibrator(settings.calibrator); - } + LOG_DEBUG("INT8 precision has been enabled, we assume the network has Q/DQ nodes obtained from modelopt"); break; case nvinfer1::DataType::kFLOAT: break; diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h index 8587885eca..0b5a09490b 100644 --- a/core/conversion/conversionctx/ConversionCtx.h +++ b/core/conversion/conversionctx/ConversionCtx.h @@ -26,7 +26,6 @@ struct BuilderSettings { bool allow_shape_tensors = false; ir::Device device; nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD; - nvinfer1::IInt8Calibrator* calibrator = nullptr; uint64_t num_avg_timing_iters = 1; uint64_t workspace_size = 0; uint64_t dla_sram_size = DLA_SRAM_SIZE; diff --git a/cpp/BUILD b/cpp/BUILD index e5cb1558e9..2dc87c6039 100644 --- a/cpp/BUILD +++ b/cpp/BUILD @@ -7,14 +7,12 @@ cc_library( srcs = [ "src/compile_spec.cpp", "src/logging.cpp", - "src/ptq.cpp", "src/torch_tensorrt.cpp", "src/types.cpp", ], hdrs = [ "include/torch_tensorrt/logging.h", "include/torch_tensorrt/macros.h", - "include/torch_tensorrt/ptq.h", "include/torch_tensorrt/torch_tensorrt.h", ], linkstatic = True, diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0c0e5a43f0..690dca2749 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -4,7 +4,6 @@ add_library(${lib_name} OBJECT) set(CXX_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/src/compile_spec.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/logging.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/src/ptq.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/torch_tensorrt.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/types.cpp" ) @@ -12,7 +11,6 @@ set(CXX_SRCS set(HEADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/include/torch_tensorrt/logging.h" "${CMAKE_CURRENT_SOURCE_DIR}/include/torch_tensorrt/macros.h" - "${CMAKE_CURRENT_SOURCE_DIR}/include/torch_tensorrt/ptq.h" "${CMAKE_CURRENT_SOURCE_DIR}/include/torch_tensorrt/torch_tensorrt.h" ) diff --git a/cpp/bin/torchtrtc/fileio.h b/cpp/bin/torchtrtc/fileio.h index ed52d566a1..c4d8bb50c4 100644 --- a/cpp/bin/torchtrtc/fileio.h +++ b/cpp/bin/torchtrtc/fileio.h @@ -23,7 +23,6 @@ #include "torch/torch.h" #include "torch_tensorrt/logging.h" -#include "torch_tensorrt/ptq.h" #include "torch_tensorrt/torch_tensorrt.h" namespace torchtrtc { diff --git a/cpp/bin/torchtrtc/main.cpp b/cpp/bin/torchtrtc/main.cpp index c36cfdd0fc..874cb96ef3 100644 --- a/cpp/bin/torchtrtc/main.cpp +++ b/cpp/bin/torchtrtc/main.cpp @@ -7,7 +7,6 @@ #include "torch/script.h" #include "torch_tensorrt/logging.h" -#include "torch_tensorrt/ptq.h" #include "torch_tensorrt/torch_tensorrt.h" #include "accuracy.h" @@ -335,8 +334,6 @@ int main(int argc, char** argv) { calibration_cache_file_path = torchtrtc::fileio::resolve_path(args::get(calibration_cache_file)); } - auto calibrator = torchtrt::ptq::make_int8_cache_calibrator(calibration_cache_file_path); - compile_settings.require_full_compilation = require_full_compilation; if (torch_executed_ops || torch_executed_mods) { @@ -367,13 +364,9 @@ int main(int argc, char** argv) { compile_settings.enabled_precisions.insert(torch::kF16); } else if (dtype == torchtrt::DataType::kChar) { compile_settings.enabled_precisions.insert(torch::kI8); - if (calibration_cache_file) { - compile_settings.ptq_calibrator = calibrator; - } else { - torchtrt::logging::log( - torchtrt::logging::Level::kINFO, - "Int8 precision has been enabled 
but no calibrator provided. This assumes the network has Q/DQ nodes obtained from Quantization aware training. For more details, refer to https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-qat-networks"); - } + torchtrt::logging::log( + torchtrt::logging::Level::kDEBUG, + "Int8 precision has been enabled which assumes the network has Q/DQ nodes obtained"); } else { std::stringstream ss; ss << "Invalid precision given for enabled kernel precision, options are [ float | float32 | f32 | fp32 | half | float16 | f16 | fp16 | char | int8 | i8 ], found: "; diff --git a/cpp/bin/torchtrtc/parser_util.h b/cpp/bin/torchtrtc/parser_util.h index 9ed5f6d06b..9cbb4ff994 100644 --- a/cpp/bin/torchtrtc/parser_util.h +++ b/cpp/bin/torchtrtc/parser_util.h @@ -9,7 +9,6 @@ #include "torch/torch.h" #include "torch_tensorrt/logging.h" -#include "torch_tensorrt/ptq.h" #include "torch_tensorrt/torch_tensorrt.h" namespace torchtrtc { diff --git a/cpp/include/torch_tensorrt/macros.h b/cpp/include/torch_tensorrt/macros.h index 020b94c114..e31091031b 100644 --- a/cpp/include/torch_tensorrt/macros.h +++ b/cpp/include/torch_tensorrt/macros.h @@ -30,9 +30,6 @@ STR(TORCH_TENSORRT_MAJOR_VERSION) \ "." STR(TORCH_TENSORRT_MINOR_VERSION) "." STR(TORCH_TENSORRT_PATCH_VERSION) -#define TORCH_TENSORRT_PTQ_DEPRECATION \ - [[deprecated( \ - "Int8 PTQ Calibrator has been deprecated by TensorRT, please plan on porting to a NVIDIA Model Optimizer Toolkit based workflow. See: https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/vgg16_ptq.html for more details")]] // Setup namespace aliases for ease of use namespace torch_tensorrt { namespace torchscript {} diff --git a/cpp/include/torch_tensorrt/ptq.h b/cpp/include/torch_tensorrt/ptq.h deleted file mode 100644 index a2f82346c0..0000000000 --- a/cpp/include/torch_tensorrt/ptq.h +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright (c) NVIDIA Corporation. - * All rights reserved. - * - * This library is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "NvInfer.h" -#include "torch/torch.h" -#include "torch_tensorrt/logging.h" -#include "torch_tensorrt/macros.h" - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -namespace torch_tensorrt { -namespace ptq { -TORCHTRT_API bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data); -} -} // namespace torch_tensorrt -#endif // DOXYGEN_SHOULD_SKIP_THIS - -namespace torch_tensorrt { -namespace ptq { - -/** - * @brief Generic Int8Calibrator implementation based on a specified - * TensorRT calibration algorithm and a LibTorch DataLoader - * - * @tparam Algorithm: class nvinfer1::IInt8Calibrator (Default: - * nvinfer1::IInt8EntropyCalibrator2) - Algorithm to use - * @tparam DataLoaderUniquePtr: std::unique_ptr - - * DataLoader type - */ -template -class Int8Calibrator : Algorithm { - using DataLoader = typename DataLoaderUniquePtr::element_type; - using Batch = typename DataLoader::super::BatchType; - - public: - /** - * @brief Construct a new Int8Calibrator object - * - * Using the provided DataLoader, construct a calibrator that can be used for - * PTQ with Torch-TensorRT - * - * @param dataloader: std::unqiue_ptr - A unique - * pointer to the DataLoader, should be what is returned from the - * make_data_loader factory - * @param cache_file_path: const std::string& - A path to store / find the - * calibration cache - * @param use_cache : bool - Whether to use the cache (if it exists) - */ - Int8Calibrator(DataLoaderUniquePtr dataloader, const std::string& cache_file_path, bool use_cache) - : dataloader_(dataloader.get()), cache_file_path_(cache_file_path), use_cache_(use_cache) { - for (auto batch : *dataloader_) { - batched_data_.push_back(batch.data); - } - it_ = batched_data_.begin(); - } - - /** - * @brief Get the Batch Size for the next batch (always 1 due to issues with - * TRT and explicit batch) - * - * @return int - */ - int getBatchSize() const noexcept override { - // HACK: Torch-TensorRT only uses explict batch sizing, INT8 Calibrator does not - // work when reporting the batch size here and having explicity batching. - // So we just report batch size 1 (warnings will still be printed out). 
- return 1; - // return static_cast(dataloader_->options().batch_size); - } - - /** - * @brief Get the next Batch - * - * @param bindings: void*[] - An array of binding pointers (fed in from - * TensorRT calibrator), these buffers should be filed with batch data for - * each input - * @param names: const char*[] - Names of bindings - * @param nbBindings: int - Number of bindings - * @return true - There is a new batch for the calibrator to consume - * @return false - There is not a new batch for the calibrator to consume - */ - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override { - if (it_ != batched_data_.end()) { - auto status = get_batch_impl(bindings, names, nbBindings, *it_); - it_ = ++it_; - return status; - } else { - // Reset iterator if incase calibrator is going to be used again - it_ = batched_data_.begin(); - return false; - } - } - - /** - * @brief Read calibration cache - * - * How to read from the calibration cache, only enabled if use_cache is set - * - * @param length - * @return const void* - Pointer to cache data - */ - const void* readCalibrationCache(size_t& length) noexcept override { - if (use_cache_) { - std::stringstream ss; - ss << "Reading Calibration Cache from " << cache_file_path_; - logging::log(logging::Level::kINFO, ss.str()); - - cache_.clear(); - std::ifstream input(cache_file_path_, std::ios::binary); - input >> std::noskipws; - if (input.good()) { - std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(cache_)); - logging::log(logging::Level::kDEBUG, "Cache read"); - } - length = cache_.size(); - return length ? cache_.data() : nullptr; - } - return nullptr; - } - - /** - * @brief Write calibration cache - * - * Write a the calibration cache provided by TensorRT to a specified file - * - * @param cache: const void* - cache data - * @param length: size_t - length of cache - */ - void writeCalibrationCache(const void* cache, size_t length) noexcept override { - std::ofstream cache_file(cache_file_path_, std::ios::binary); - cache_file.write(reinterpret_cast(cache), length); - std::stringstream ss; - ss << "Saved Calibration Cache to " << cache_file_path_; - logging::log(logging::Level::kINFO, ss.str()); - } - - /** - * @brief operator to cast to nvinfer1::IInt8Calibrator* - * - * Convience function to convert to a IInt8Calibrator* to easily be assigned - * to the ptq_calibrator field in CompileSpec - * - * @return nvinfer1::IInt8Calibrator* - */ - operator nvinfer1::IInt8Calibrator*() { - return reinterpret_cast(this); - } - - private: - /// Pointer to the dataloader - DataLoader* dataloader_; - /// Path to cache file - const std::string& cache_file_path_; - /// Size of cache - size_t cache_size_ = 0; - /// Whether to use the cache or not - bool use_cache_; - /// Cache data - std::vector cache_; - /// Batched Data - std::vector batched_data_; - /// Iterator to move through dataset - std::vector::iterator it_; -}; - -/** - * @brief Generic Int8Calibrator implementation based on a specified - * TensorRT calibration algorithm that only reads from a calibration file - * - * @tparam Algorithm: class nvinfer1::IInt8Calibrator (Default: - * nvinfer1::IInt8EntropyCalibrator2) - Algorithm to use - */ -template -class Int8CacheCalibrator : Algorithm { - public: - /** - * @brief Construct a new Int 8 Cache Calibrator object - * - * @param cache_file_path - */ - Int8CacheCalibrator(const std::string& cache_file_path) : cache_file_path_(cache_file_path) {} - - /** - * @brief Get the Batch Size for the 
next batch (always 1 due to issues with - * TRT and explicit batch) - * - * @return int - */ - int getBatchSize() const noexcept override { - // HACK: Torch-TensorRT only uses explict batch sizing, INT8 Calibrator does not - // work when reporting the batch size here and having explicity batching. - // So we just report batch size 1 (warnings will still be printed out). - return 1; - } - - /** - * @brief Get the next Batch - * - * Not used always returns false - * - * @param bindings: void*[] - An array of binding pointers (fed in from - * TensorRT calibrator), these buffers should be filed with batch data for - * each input - * @param names: const char*[] - Names of bindings - * @param nbBindings: int - Number of bindings - * @return false - */ - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override { - return false; - } - - /** - * @brief Read calibration cache - * - * How to read from the calibration cache, only enabled if use_cache is set - * - * @param length - * @return const void* - Pointer to cache data - */ - const void* readCalibrationCache(size_t& length) noexcept override { - std::stringstream ss; - ss << "Reading Calibration Cache from " << cache_file_path_; - logging::log(logging::Level::kINFO, ss.str()); - - cache_.clear(); - std::ifstream input(cache_file_path_, std::ios::binary); - input >> std::noskipws; - if (input.good()) { - std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(cache_)); - logging::log(logging::Level::kDEBUG, "Cache read"); - } - length = cache_.size(); - return length ? cache_.data() : nullptr; - } - - /** - * @brief Write calibration cache - * - * Write a the calibration cache provided by TensorRT to a specified file - * - * @param cache: const void* - cache data - * @param length: size_t - length of cache - */ - void writeCalibrationCache(const void* cache, size_t length) noexcept override { - std::ofstream cache_file(cache_file_path_, std::ios::binary); - cache_file.write(reinterpret_cast(cache), length); - std::stringstream ss; - ss << "Saved Calibration Cache to " << cache_file_path_; - logging::log(logging::Level::kINFO, ss.str()); - } - - /** - * @brief operator to cast to nvinfer1::IInt8Calibrator* - * - * Convience function to convert to a IInt8Calibrator* to easily be assigned - * to the ptq_calibrator field in CompileSpec - * - * @return nvinfer1::IInt8Calibrator* - */ - operator nvinfer1::IInt8Calibrator*() { - return reinterpret_cast(this); - } - - private: - /// Path to cache file - const std::string& cache_file_path_; - /// Size of cache - size_t cache_size_ = 0; - /// Cache data - std::vector cache_; -}; - -/** - * @brief A factory to build a post training quantization calibrator from a - * torch dataloader - * - * Creates a calibrator to use for post training quantization. By default the - * returned calibrator uses TensorRT Entropy v2 algorithm to perform - * calibration. This is recommended for feed forward networks. You can override - * the algorithm selection (such as to use the MinMax Calibrator recomended for - * NLP tasks) by calling make_int8_calibrator with the calibrator class as a - * template parameter. - * - * e.g. 
- * ``torch_tensorrt::ptq::make_int8_calibrator(std::move(calibration_dataloader), - * calibration_cache_file, use_cache);`` - * @tparam Algorithm: class nvinfer1::IInt8Calibrator (Default: - * nvinfer1::IInt8EntropyCalibrator2) - Algorithm to use - * @tparam DataLoader: std::unique_ptr - DataLoader - * type - * @param dataloader: std::unique_ptr - DataLoader - * containing data - * @param cache_file_path: const std::string& - Path to read/write calibration - * cache - * @param use_cache: bool - use calibration cache - * @return Int8Calibrator - */ -template -TORCH_TENSORRT_PTQ_DEPRECATION inline Int8Calibrator make_int8_calibrator( - DataLoader dataloader, - const std::string& cache_file_path, - bool use_cache) { - return Int8Calibrator(std::move(dataloader), cache_file_path, use_cache); -} - -/** - * @brief A factory to build a post training quantization calibrator from a - * torch dataloader that only uses the calibration cache - * - * Creates a calibrator to use for post training quantization which reads from a - * previously created calibration cache, therefore you can have a calibration - * cache generating program that requires a dataloader and a dataset, then save - * the cache to use later in a different program that needs to calibrate from - * scratch and not have the dataset dependency. However, the network should also - * be recalibrated if its structure changes, or the input data set changes, and - * it is the responsibility of the application to ensure this. - * - * By default the returned calibrator uses TensorRT Entropy v2 algorithm to - * perform calibration. This is recommended for feed forward networks You can - * override the algorithm selection (such as to use the MinMax Calibrator - * recomended for NLP tasks) by calling make_int8_calibrator with the calibrator - * class as a template parameter. - * - * e.g. 
- * torch_tensorrt::ptq::make_int8_cache_calibrator(calibration_cache_file); - * @tparam Algorithm: class nvinfer1::IInt8Calibrator (Default: - * nvinfer1::IInt8EntropyCalibrator2) - Algorithm to use - * @param cache_file_path: const std::string& - Path to read/write calibration - * cache - * @return Int8CacheCalibrator - */ -template -TORCH_TENSORRT_PTQ_DEPRECATION inline Int8CacheCalibrator make_int8_cache_calibrator( - const std::string& cache_file_path) { - return Int8CacheCalibrator(cache_file_path); -} - -} // namespace ptq -} // namespace torch_tensorrt diff --git a/cpp/include/torch_tensorrt/torch_tensorrt.h b/cpp/include/torch_tensorrt/torch_tensorrt.h index adac75d984..8cf4449e75 100644 --- a/cpp/include/torch_tensorrt/torch_tensorrt.h +++ b/cpp/include/torch_tensorrt/torch_tensorrt.h @@ -34,9 +34,6 @@ template class ArrayRef; } // namespace c10 -namespace nvinfer1 { -class IInt8Calibrator; -} #endif // DOXYGEN_SHOULD_SKIP_THIS namespace torch_tensorrt { @@ -833,11 +830,6 @@ struct CompileSpec { */ uint64_t dla_global_dram_size = 536870912; - /** - * Calibration dataloaders for each input for post training quantizatiom - */ - nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; - /** * Require the full module be compiled to TensorRT instead of potentially running unsupported operations in PyTorch */ diff --git a/cpp/src/compile_spec.cpp b/cpp/src/compile_spec.cpp index 68a25b3912..6e5cf99850 100644 --- a/cpp/src/compile_spec.cpp +++ b/cpp/src/compile_spec.cpp @@ -156,15 +156,8 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external, bool if (internal.convert_info.engine_settings.enabled_precisions.find(nvinfer1::DataType::kINT8) != internal.convert_info.engine_settings.enabled_precisions.end()) { internal.partitioning_info.cast_int8_inputs = false; - if (external.ptq_calibrator) { - internal.convert_info.engine_settings.calibrator = external.ptq_calibrator; - } else { - internal.lower_info.unfreeze_module = true; - internal.lower_info.disable_cse = true; - internal.convert_info.engine_settings.calibrator = nullptr; - } - } else { - internal.convert_info.engine_settings.calibrator = nullptr; + internal.lower_info.unfreeze_module = true; + internal.lower_info.disable_cse = true; } return internal; diff --git a/cpp/src/ptq.cpp b/cpp/src/ptq.cpp deleted file mode 100644 index 7d36e9ce7d..0000000000 --- a/cpp/src/ptq.cpp +++ /dev/null @@ -1,16 +0,0 @@ -#include "torch_tensorrt/ptq.h" -#include "torch/torch.h" - -namespace torch_tensorrt { -namespace ptq { - -bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data) { - for (int i = 0; i < nbBindings; i++) { - data = data.to(at::kCUDA).contiguous(); - bindings[i] = data.data_ptr(); - } - return true; -} - -} // namespace ptq -} // namespace torch_tensorrt diff --git a/py/BUILD.bazel b/py/BUILD.bazel index 57bde9cf0a..dcbb337132 100644 --- a/py/BUILD.bazel +++ b/py/BUILD.bazel @@ -13,7 +13,6 @@ py_library( "torch_tensorrt/_types.py", "torch_tensorrt/_version.py", "torch_tensorrt/logging.py", - "torch_tensorrt/ptq.py", ], data = [ "torch_tensorrt/lib/libtrtorch.so", diff --git a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp index bae61881da..04e9115cd1 100644 --- a/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/register_tensorrt_classes.cpp @@ -67,7 +67,6 @@ void RegisterTRTCompileSpec() { .def("_set_precisions", &torch_tensorrt::pyapi::CompileSpec::setPrecisions) .def("_set_device", 
&torch_tensorrt::pyapi::CompileSpec::setDeviceIntrusive) .def("_set_torch_fallback", &torch_tensorrt::pyapi::CompileSpec::setTorchFallbackIntrusive) - .def("_set_ptq_calibrator", &torch_tensorrt::pyapi::CompileSpec::setPTQCalibratorViaHandle) .def("__str__", &torch_tensorrt::pyapi::CompileSpec::stringify); ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, sparse_weights); diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.cpp b/py/torch_tensorrt/csrc/tensorrt_classes.cpp index bd3aa6b305..788a45184b 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.cpp +++ b/py/torch_tensorrt/csrc/tensorrt_classes.cpp @@ -343,16 +343,11 @@ core::CompileSpec CompileSpec::toInternalCompileSpec(bool converting_to_trt_engi info.partitioning_info.cast_int8_inputs = true; - if (ptq_calibrator) { - info.convert_info.engine_settings.calibrator = ptq_calibrator; + if (info.convert_info.engine_settings.enabled_precisions.find(nvinfer1::DataType::kINT8) != + info.convert_info.engine_settings.enabled_precisions.end()) { info.partitioning_info.cast_int8_inputs = false; - } else { - if (info.convert_info.engine_settings.enabled_precisions.find(nvinfer1::DataType::kINT8) != - info.convert_info.engine_settings.enabled_precisions.end()) { - info.partitioning_info.cast_int8_inputs = false; - info.lower_info.unfreeze_module = true; - info.lower_info.disable_cse = true; - } + info.lower_info.unfreeze_module = true; + info.lower_info.disable_cse = true; } info.convert_info.engine_settings.sparse_weights = sparse_weights; info.convert_info.engine_settings.disable_tf32 = disable_tf32; diff --git a/py/torch_tensorrt/csrc/tensorrt_classes.h b/py/torch_tensorrt/csrc/tensorrt_classes.h index 89c5c8661e..2bf8fe8f52 100644 --- a/py/torch_tensorrt/csrc/tensorrt_classes.h +++ b/py/torch_tensorrt/csrc/tensorrt_classes.h @@ -140,10 +140,6 @@ struct CompileSpec : torch::CustomClassHolder { } } - int64_t getPTQCalibratorHandle() { - return (int64_t)ptq_calibrator; - } - void setDeviceIntrusive(const c10::intrusive_ptr& d) { device = *d; } @@ -152,10 +148,6 @@ struct CompileSpec : torch::CustomClassHolder { torch_fallback = *fb; } - void setPTQCalibratorViaHandle(int64_t handle) { - ptq_calibrator = (nvinfer1::IInt8Calibrator*)handle; - } - ADD_FIELD_GET_SET(disable_tf32, bool); ADD_FIELD_GET_SET(sparse_weights, bool); ADD_FIELD_GET_SET(refit, bool); @@ -170,11 +162,9 @@ struct CompileSpec : torch::CustomClassHolder { ADD_FIELD_GET_SET(allow_shape_tensors, bool); ADD_FIELD_GET_SET(device, Device); ADD_FIELD_GET_SET(torch_fallback, TorchFallback); - ADD_FIELD_GET_SET(ptq_calibrator, nvinfer1::IInt8Calibrator*); std::vector inputs; InputSignature input_signature; - nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; std::set enabled_precisions = {}; bool sparse_weights = false; bool disable_tf32 = false; diff --git a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp index e32d102f8b..378f96cd0e 100644 --- a/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp +++ b/py/torch_tensorrt/csrc/torch_tensorrt_py.cpp @@ -18,135 +18,6 @@ namespace py = pybind11; namespace torch_tensorrt { namespace pyapi { -template -class pyCalibratorTrampoline : public Derived { - public: - using Derived::Derived; // Inherit constructors - - int getBatchSize() const noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME(int, Derived, "get_batch_size", getBatchSize); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in get_batch_size" + std::string(e.what())); - 
} catch (...) { - LOG_ERROR("Exception caught in get_batch_size"); - } - return -1; - } - - bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override { - py::gil_scoped_acquire gil{}; - - py::function pyGetBatch = torch_tensorrt::pyapi::util::getOverload(static_cast(this), "get_batch"); - std::vector namesVec(names, names + nbBindings); - py::object result = pyGetBatch(namesVec); - // Copy over into the other data structure. - if (!result.is_none() && result.cast>().size() != 0) { - std::memcpy(bindings, result.cast>().data(), nbBindings * sizeof(void*)); - return true; - } - return false; - } - - const void* readCalibrationCache(std::size_t& length) noexcept override { - py::gil_scoped_acquire gil{}; - - py::function pyReadCalibrationCache = - torch_tensorrt::pyapi::util::getOverload(static_cast(this), "read_calibration_cache"); - py::buffer cache = pyReadCalibrationCache(); - if (!cache.is_none()) { - py::buffer_info info = cache.request(); - length = info.size * info.itemsize; - return info.ptr; - } - return nullptr; - } - - void writeCalibrationCache(const void* ptr, std::size_t length) noexcept override { - py::gil_scoped_acquire gil{}; - - py::function pyWriteCalibrationCache = - torch_tensorrt::pyapi::util::getOverload(static_cast(this), "write_calibration_cache"); - - py::memoryview cache{py::memoryview::from_buffer(static_cast(ptr), {length}, {sizeof(uint8_t)})}; - pyWriteCalibrationCache(cache); - } -}; - -class pyIInt8Calibrator : public pyCalibratorTrampoline { - public: - using Derived = pyCalibratorTrampoline; - using Derived::Derived; - - nvinfer1::InterfaceInfo getInterfaceInfo() const noexcept override { - return nvinfer1::InterfaceInfo{"PYTHON CALIBRATOR", 1, 0}; - } - - nvinfer1::CalibrationAlgoType getAlgorithm() noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME( - nvinfer1::CalibrationAlgoType, nvinfer1::IInt8Calibrator, "get_algorithm", getAlgorithm); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in get_algorithm: " + std::string(e.what())); - } catch (...) { - LOG_ERROR("Exception caught in get_algorithm"); - } - return {}; - } -}; - -class pyIInt8LegacyCalibrator : public pyCalibratorTrampoline { - public: - using Derived = pyCalibratorTrampoline; - using Derived::Derived; - - double getQuantile() const noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME(double, nvinfer1::IInt8LegacyCalibrator, "get_quantile", getQuantile); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in get_quantile: " + std::string(e.what())); - } catch (...) { - LOG_ERROR("Exception caught in get_quantile"); - } - return -1.0; - } - - double getRegressionCutoff() const noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME( - double, nvinfer1::IInt8LegacyCalibrator, "get_regression_cutoff", getRegressionCutoff); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in get_regression_cutoff: " + std::string(e.what())); - } catch (...) { - LOG_ERROR("Exception caught in get_regression_cutoff"); - } - return -1.0; - } - - const void* readHistogramCache(std::size_t& length) noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME( - const char*, nvinfer1::IInt8LegacyCalibrator, "read_histogram_cache", readHistogramCache, length); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in read_histogram_cache" + std::string(e.what())); - } catch (...) 
{ - LOG_ERROR("Exception caught in read_histogram_cache"); - } - return {}; - } - - void writeHistogramCache(const void* ptr, std::size_t length) noexcept override { - try { - PYBIND11_OVERLOAD_PURE_NAME( - void, nvinfer1::IInt8LegacyCalibrator, "write_histogram_cache", writeHistogramCache, ptr, length); - } catch (std::exception const& e) { - LOG_ERROR("Exception caught in write_histogram_cache" + std::string(e.what())); - } catch (...) { - LOG_ERROR("Exception caught in write_histogram_cache"); - } - } -}; - void set_device(const int device_id) { core::set_device(device_id); } @@ -275,51 +146,6 @@ PYBIND11_MODULE(_C, m) { .value("channels_last", TensorFormat::kChannelsLast, "Channels last memory layout (NHWC)") .export_values(); - py::enum_(m, "CalibrationAlgo", py::module_local(), "Type of calibration algorithm") - .value("LEGACY_CALIBRATION", nvinfer1::CalibrationAlgoType::kLEGACY_CALIBRATION) - .value("ENTROPY_CALIBRATION", nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION) - .value("ENTROPY_CALIBRATION_2", nvinfer1::CalibrationAlgoType::kENTROPY_CALIBRATION_2) - .value("MINMAX_CALIBRATION", nvinfer1::CalibrationAlgoType::kMINMAX_CALIBRATION); - - py::class_( - m, "IInt8Calibrator", py::module_local(), "Int8 Calibrator base class") - .def(py::init_alias<>()) // Always initialize trampoline class. - .def("get_batch_size", &nvinfer1::IInt8Calibrator::getBatchSize, "Get batch size") - .def("get_algorithm", &nvinfer1::IInt8Calibrator::getAlgorithm, "Get algorithm"); - - py::class_( - m, "IInt8LegacyCalibrator", py::module_local(), "Int8 Legacy Calibrator class") - .def(py::init_alias<>()) // Always initialize trampoline class. - .def("get_batch_size", &nvinfer1::IInt8LegacyCalibrator::getBatchSize, "Get batch size") - .def("get_algorithm", &nvinfer1::IInt8LegacyCalibrator::getAlgorithm, "Get algorithm"); - - py::class_< - nvinfer1::IInt8EntropyCalibrator, - nvinfer1::IInt8Calibrator, - pyCalibratorTrampoline>( - m, "IInt8EntropyCalibrator", py::module_local(), "Int8 Entropy Calibrator class") - .def(py::init_alias<>()) // Always initialize trampoline class. - .def("get_batch_size", &nvinfer1::IInt8EntropyCalibrator::getBatchSize, "Get batch size") - .def("get_algorithm", &nvinfer1::IInt8EntropyCalibrator::getAlgorithm, "Get algorithm"); - - py::class_< - nvinfer1::IInt8EntropyCalibrator2, - nvinfer1::IInt8Calibrator, - pyCalibratorTrampoline>( - m, "IInt8EntropyCalibrator2", py::module_local(), "Int8 Entropy Calibrator2 class") - .def(py::init_alias<>()) // Always initialize trampoline class. - .def("get_batch_size", &nvinfer1::IInt8EntropyCalibrator2::getBatchSize, "Get batch size") - .def("get_algorithm", &nvinfer1::IInt8EntropyCalibrator2::getAlgorithm, "Get algorithm"); - - py::class_< - nvinfer1::IInt8MinMaxCalibrator, - nvinfer1::IInt8Calibrator, - pyCalibratorTrampoline>( - m, "IInt8MinMaxCalibrator", py::module_local(), "Int8 MinMax Calibrator class") - .def(py::init_alias<>()) // Always initialize trampoline class. 
- .def("get_batch_size", &nvinfer1::IInt8MinMaxCalibrator::getBatchSize, "Get batch size") - .def("get_algorithm", &nvinfer1::IInt8MinMaxCalibrator::getAlgorithm, "Get algorithm"); - py::class_(m, "Device") .def(py::init<>()) .def("__str__", &torch_tensorrt::pyapi::Device::to_str) @@ -362,11 +188,9 @@ PYBIND11_MODULE(_C, m) { py::class_(ts_sub_mod, "CompileSpec") .def(py::init<>()) .def("__str__", &torch_tensorrt::pyapi::CompileSpec::stringify) - .def("_get_calibrator_handle", &CompileSpec::getPTQCalibratorHandle, "[Internal] gets a handle from a calibrator") .def_readwrite("inputs", &CompileSpec::inputs) .def_readwrite("input_signature", &CompileSpec::input_signature) .def_readwrite("enabled_precisions", &CompileSpec::enabled_precisions) - .def_readwrite("ptq_calibrator", &CompileSpec::ptq_calibrator) .def_readwrite("refit", &CompileSpec::refit) .def_readwrite("sparse_weights", &CompileSpec::sparse_weights) .def_readwrite("disable_tf32", &CompileSpec::disable_tf32) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 74cab980c4..608c8e84c9 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -148,7 +148,6 @@ def cross_compile_for_windows( dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution truncate_double (bool): Truncate weights provided in double (float64) to float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (Collection[Target]): Set of aten operators that must be run in PyTorch. An error will be thrown if this set is not empty but ``require_full_compilation`` is True @@ -487,7 +486,6 @@ def compile( dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution truncate_double (bool): Truncate weights provided in double (float64) to float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (Collection[Target]): Set of aten operators that must be run in PyTorch. 
An error will be thrown if this set is not empty but ``require_full_compilation`` is True @@ -1042,7 +1040,6 @@ def convert_exported_program_to_serialized_trt_engine( dla_sram_size: int = _defaults.DLA_SRAM_SIZE, dla_local_dram_size: int = _defaults.DLA_LOCAL_DRAM_SIZE, dla_global_dram_size: int = _defaults.DLA_GLOBAL_DRAM_SIZE, - calibrator: object = None, allow_shape_tensors: bool = False, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING, @@ -1107,7 +1104,6 @@ def convert_exported_program_to_serialized_trt_engine( dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer. dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation which respects the precisions set in the Pytorch model. This is useful when users have mixed precision graphs. diff --git a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py index f81f7cab32..6006484f19 100644 --- a/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py @@ -6,6 +6,7 @@ import numpy as np import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Argument, Node, Target from torch_tensorrt._utils import is_tensorrt_version_supported from torch_tensorrt.dynamo._settings import CompilationSettings @@ -22,7 +23,6 @@ get_positive_dim, is_only_operator_on_placeholder, ) -from torch_tensorrt.dynamo.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 53835ba1d5..7d7f4274ff 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -19,11 +19,10 @@ import numpy as np import tensorrt as trt import torch +import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata - -import torch_tensorrt.dynamo.conversion.impl as impl from torch_tensorrt import _enums from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR diff --git a/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py b/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py index db257b9c4e..ed30e2ff18 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/activation/base.py @@ -1,14 +1,13 @@ from typing import Any, Callable, Optional import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import 
ConversionContext -from torch_tensorrt.fx.converters.converter_utils import ( - mark_as_int8_layer, +from torch_tensorrt.dynamo.conversion.converter_utils import ( set_layer_name, ) -from torch_tensorrt.fx.types import TRTTensor def convert_activation( @@ -37,11 +36,4 @@ def convert_activation( layer.beta = beta set_layer_name(layer, target, name, source_ir) - if ( - not ctx.net.get_flag(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED) - and input_val.dynamic_range is not None - and dyn_range_fn is not None - ): - dyn_range = dyn_range_fn(input_val.dynamic_range) - mark_as_int8_layer(layer, dyn_range) return layer.get_output(0) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py index eb981f2031..af47a8e2c9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py @@ -3,11 +3,11 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.impl.activation.base import convert_activation -from torch_tensorrt.dynamo.types import TRTTensor def relu( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/addmm.py b/py/torch_tensorrt/dynamo/conversion/impl/addmm.py index 1a0690852a..46ee1f974c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/addmm.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/addmm.py @@ -2,11 +2,11 @@ import numpy as np import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.fx.types import TRTTensor def addmm( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/arange.py b/py/torch_tensorrt/dynamo/conversion/impl/arange.py index baaf690010..7595e97171 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/arange.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/arange.py @@ -2,6 +2,7 @@ import numpy as np import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -11,7 +12,6 @@ get_trt_tensor, set_layer_name, ) -from torch_tensorrt.fx.types import TRTTensor def arange( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cast.py b/py/torch_tensorrt/dynamo/conversion/impl/cast.py index 0b69f98fc9..4ad39d4563 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cast.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/cast.py @@ -4,6 +4,8 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import DataType as TRTDataType +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -13,7 +15,6 @@ cast_trt_tensor, get_trt_tensor, ) -from torch_tensorrt.fx.types import TRTDataType, TRTTensor LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cat.py b/py/torch_tensorrt/dynamo/conversion/impl/cat.py index 096bc1aa24..68bbcc31d0 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/cat.py +++ 
b/py/torch_tensorrt/dynamo/conversion/impl/cat.py @@ -3,6 +3,7 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -11,9 +12,8 @@ cast_trt_tensor, get_positive_dim, get_trt_tensor, + set_layer_name, ) -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor def cat( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/condition/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/condition/ops.py index e21e7f32a1..b7739c3b3f 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/condition/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/condition/ops.py @@ -3,6 +3,7 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -15,7 +16,6 @@ set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.elementwise import ne -from torch_tensorrt.fx.types import TRTTensor def where( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/conv.py b/py/torch_tensorrt/dynamo/conversion/impl/conv.py index 918c87ca70..513346a63b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/conv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/conv.py @@ -5,6 +5,7 @@ # @manual=//deeplearning/trt/python:py_tensorrt import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -17,11 +18,6 @@ to_torch, to_trt_weights, ) -from torch_tensorrt.fx.converters.converter_utils import ( - get_dyn_range, - mark_as_int8_layer, -) -from torch_tensorrt.fx.types import TRTTensor def convNd( @@ -172,11 +168,6 @@ def convNd( if groups is not None: conv_layer.num_groups = groups - # Handle quantization cases - if scale is not None and zero_point is not None: - # Assume the dtype of activation is torch.quint8 - mark_as_int8_layer(conv_layer, get_dyn_range(scale, zero_point, torch.quint8)) - result = conv_layer.get_output(0) if is_conv1d: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py index 6a21415ffe..b9ee582d26 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py @@ -5,6 +5,7 @@ # @manual=//deeplearning/trt/python:py_tensorrt import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -12,15 +13,10 @@ SourceIR, get_trt_tensor, has_dynamic_shape, + set_layer_name, to_torch, to_trt_weights, ) -from torch_tensorrt.fx.converters.converter_utils import ( - get_dyn_range, - mark_as_int8_layer, - set_layer_name, -) -from torch_tensorrt.fx.types import TRTTensor def deconvNd( @@ -174,11 +170,6 @@ def deconvNd( deconv_layer.pre_padding = tuple(pre_padding_values) deconv_layer.post_padding = tuple(post_padding_values) - # Handle quantization cases - if scale is not None and zero_point is not None: - # Assume the dtype of activation is torch.quint8 - mark_as_int8_layer(deconv_layer, 
get_dyn_range(scale, zero_point, torch.quint8)) - result = deconv_layer.get_output(0) if is_deconv1d: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py index e935992bda..040828e297 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/dynamic_block_quantize.py @@ -3,6 +3,7 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target from torch_tensorrt._utils import is_tensorrt_version_supported @@ -10,9 +11,8 @@ from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( get_trt_tensor, + set_layer_name, ) -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor if is_tensorrt_version_supported("10.8.0"): diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index 1bfb8c7242..b425973661 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -4,6 +4,7 @@ import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -21,7 +22,6 @@ ) from torch_tensorrt.dynamo.conversion.impl.unary import atan, sign from torch_tensorrt.dynamo.conversion.impl.unary.base import convert_unary -from torch_tensorrt.fx.types import TRTTensor def trunc_div( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/embedding.py b/py/torch_tensorrt/dynamo/conversion/impl/embedding.py index 4188c63e30..a712641f44 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/embedding.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/embedding.py @@ -6,6 +6,7 @@ import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -14,10 +15,9 @@ cast_trt_tensor, get_trt_tensor, set_item, + set_layer_name, to_numpy, ) -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor def embedding( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/full.py b/py/torch_tensorrt/dynamo/conversion/impl/full.py index fc079f7f32..5c70d4772f 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/full.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/full.py @@ -3,6 +3,7 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo.conversion import impl @@ -12,7 +13,6 @@ cast_trt_tensor, get_trt_tensor, ) -from torch_tensorrt.fx.types import TRTTensor def full( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/grid.py b/py/torch_tensorrt/dynamo/conversion/impl/grid.py index 302d286237..00211fb520 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/grid.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/grid.py 
@@ -1,11 +1,11 @@ from typing import Optional import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import set_layer_name -from torch_tensorrt.dynamo.types import TRTTensor # bilinear, nearest, bicubic GridSamplerInterpolationMode = { diff --git a/py/torch_tensorrt/dynamo/conversion/impl/linear.py b/py/torch_tensorrt/dynamo/conversion/impl/linear.py index 5e859a46d3..3827284950 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/linear.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/linear.py @@ -3,11 +3,12 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target +from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import SourceIR, get_trt_tensor -from torch_tensorrt.dynamo.types import TRTTensor +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor def linear( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py index 83ea3dd99b..65e4f53328 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py @@ -2,6 +2,7 @@ import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -12,7 +13,6 @@ get_trt_tensor, set_layer_name, ) -from torch_tensorrt.dynamo.types import TRTTensor def matrix_multiply( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py b/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py index c28c5bcc7d..e64c06ca39 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py @@ -5,8 +5,9 @@ import numpy as np import tensorrt as trt from torch.fx.node import Argument, Target +from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.fx.converters.converter_utils import SourceIR, set_layer_name +from torch_tensorrt.dynamo.conversion.converter_utils import set_layer_name # class for AllReduce diff --git a/py/torch_tensorrt/dynamo/conversion/impl/pad.py b/py/torch_tensorrt/dynamo/conversion/impl/pad.py index 731058a122..863b6bc218 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/pad.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/pad.py @@ -2,6 +2,7 @@ import numpy as np import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -11,7 +12,6 @@ set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.dynamo.types import TRTTensor """ Note: IPaddingLayer is deprecated in TensorRT 8.2 and will be removed in TensorRT 10.0. 
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/permutation.py b/py/torch_tensorrt/dynamo/conversion/impl/permutation.py index 1537d0fdbe..60ab762fa6 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/permutation.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/permutation.py @@ -1,6 +1,7 @@ from typing import Optional, Sequence, Union import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -12,7 +13,6 @@ set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.fx.types import TRTTensor def permute( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/pool.py b/py/torch_tensorrt/dynamo/conversion/impl/pool.py index 4e18aaaef2..757f7209d9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/pool.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/pool.py @@ -3,18 +3,16 @@ import tensorrt as trt import torch_tensorrt.dynamo.conversion.impl as impl +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( extend_attr_to_tuple, get_positive_dim, -) -from torch_tensorrt.fx.converters.converter_utils import ( has_dynamic_shape, set_layer_name, ) -from torch_tensorrt.fx.types import TRTTensor def avg_poolNd( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/prelu.py b/py/torch_tensorrt/dynamo/conversion/impl/prelu.py index 166ce16367..8e218f49cb 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/prelu.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/prelu.py @@ -1,10 +1,10 @@ from typing import Optional +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import set_layer_name -from torch_tensorrt.dynamo.types import TRTTensor def prelu( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py index 2aeedb144e..8dd32b11fc 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py @@ -3,14 +3,17 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_torch -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor +from torch_tensorrt.dynamo.conversion.converter_utils import ( + get_trt_tensor, + set_layer_name, + to_torch, +) def get_ir(target: Target) -> SourceIR: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/reduce.py b/py/torch_tensorrt/dynamo/conversion/impl/reduce.py index a61a11772d..2bd7d7de36 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/reduce.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/reduce.py @@ -1,6 +1,7 @@ from typing 
import Optional, Sequence, Tuple, Union import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -9,9 +10,8 @@ cast_trt_tensor, get_axes_for_reduce_op, get_positive_dim, + set_layer_name, ) -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor def amax( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/select.py b/py/torch_tensorrt/dynamo/conversion/impl/select.py index fe6ade2e68..c4d44a07ea 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/select.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/select.py @@ -4,6 +4,7 @@ import numpy as np import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -13,16 +14,13 @@ cast_trt_tensor, get_positive_dim, get_trt_tensor, + has_dynamic_shape, + set_layer_name, to_numpy, ) from torch_tensorrt.dynamo.conversion.impl.elementwise import convert_binary_elementwise from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.utils import DYNAMIC_DIM -from torch_tensorrt.fx.converters.converter_utils import ( - has_dynamic_shape, - set_layer_name, -) -from torch_tensorrt.fx.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shape.py b/py/torch_tensorrt/dynamo/conversion/impl/shape.py index c2dfac802b..27af02e5bb 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shape.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shape.py @@ -3,7 +3,9 @@ from typing import List, Optional, Tuple import numpy as np +import tensorrt as trt import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -11,18 +13,15 @@ cast_trt_tensor, get_positive_dim, get_trt_tensor, + set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.elementwise.base import ( convert_binary_elementwise, ) -from torch_tensorrt.fx.converters.converter_utils import ( +from torch_tensorrt.dynamo.utils import ( Frameworks, - set_layer_name, unified_dtype_converter, ) -from torch_tensorrt.fx.types import TRTTensor - -import tensorrt as trt def shape( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py index 975480f390..2ef6c740ae 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/shuffle.py @@ -3,6 +3,7 @@ import numpy as np import tensorrt as trt import torch_tensorrt.dynamo.conversion.impl as impl +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt import _enums from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -14,7 +15,6 @@ set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.fx.types import TRTTensor def reshape( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py index a2af840a1f..c2edaceafb 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py +++ 
b/py/torch_tensorrt/dynamo/conversion/impl/slice/base.py @@ -1,14 +1,15 @@ from typing import Optional +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.fx.converters.converter_utils import ( +from torch_tensorrt.dynamo.conversion.converter_utils import ( has_dynamic_shape, set_layer_name, ) -from torch_tensorrt.fx.types import Shape, TRTTensor +from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape +from torch_tensorrt.dynamo.types import Shape def slice( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 203bb03553..6a59cfda4c 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -4,8 +4,8 @@ import numpy as np import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target - from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -26,8 +26,8 @@ from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.conversion.impl.slice.base import slice +from torch_tensorrt.dynamo.types import Shape from torch_tensorrt.dynamo.utils import DYNAMIC_DIM -from torch_tensorrt.fx.types import Shape, TRTTensor def slice_op( # TODO: This should be slice not whatever is in base diff --git a/py/torch_tensorrt/dynamo/conversion/impl/split.py b/py/torch_tensorrt/dynamo/conversion/impl/split.py index 0f07ceb7ab..143a05a5b2 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/split.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/split.py @@ -1,14 +1,14 @@ from typing import List, Optional, Sequence, Union +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.fx.converters.converter_utils import ( +from torch_tensorrt.dynamo.conversion.converter_utils import ( has_dynamic_shape, set_layer_name, ) -from torch_tensorrt.fx.types import TRTTensor +from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape def split( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/squeeze.py b/py/torch_tensorrt/dynamo/conversion/impl/squeeze.py index dd6a2b9863..371b0a3c72 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/squeeze.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/squeeze.py @@ -1,5 +1,6 @@ from typing import Optional, Sequence, Union +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -7,7 +8,6 @@ get_positive_dim, set_layer_name, ) -from torch_tensorrt.fx.types import TRTTensor def squeeze( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/topk.py b/py/torch_tensorrt/dynamo/conversion/impl/topk.py index 3b6549d285..053a46ce2b 
100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/topk.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/topk.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple, Union import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion import impl @@ -10,14 +11,13 @@ flatten_dims, get_axes_for_reduce_op, get_positive_dim, - set_layer_name, get_trt_tensor, has_dynamic_shape, + set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.elementwise import convert_binary_elementwise from torch_tensorrt.dynamo.conversion.impl.shape import shape as get_shape from torch_tensorrt.dynamo.utils import DYNAMIC_DIM -from torch_tensorrt.dynamo.types import TRTTensor def argmax_argmin( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/base.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/base.py index 5da8bad252..51521ceac9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/base.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/base.py @@ -1,11 +1,11 @@ from typing import Optional import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTTensor +from torch_tensorrt.dynamo.conversion.converter_utils import set_layer_name def convert_unary( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py index 89e490392d..12f6051457 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py @@ -4,16 +4,17 @@ import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl +from tensorrt import DataType as TRTDataType +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( cast_trt_tensor, get_trt_tensor, + set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.unary.base import convert_unary -from torch_tensorrt.fx.converters.converter_utils import set_layer_name -from torch_tensorrt.fx.types import TRTDataType, TRTTensor def exp( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unsqueeze.py b/py/torch_tensorrt/dynamo/conversion/impl/unsqueeze.py index 35f21198d4..1a54b470f9 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unsqueeze.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unsqueeze.py @@ -1,6 +1,7 @@ import logging from typing import List, Optional, Sequence, cast +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -9,7 +10,6 @@ get_trt_tensor, set_layer_name, ) -from torch_tensorrt.dynamo.types import TRTTensor logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/upsample.py b/py/torch_tensorrt/dynamo/conversion/impl/upsample.py index 247179455c..4b47ca5dec 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/upsample.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/upsample.py @@ 
-1,6 +1,7 @@ from typing import Optional, Sequence import tensorrt as trt +from tensorrt import ITensor as TRTTensor from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -9,7 +10,6 @@ set_layer_name, ) from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape -from torch_tensorrt.dynamo.types import TRTTensor def upsample( diff --git a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py index a2feb99d56..9401e3d99d 100644 --- a/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py +++ b/py/torch_tensorrt/dynamo/conversion/ops_evaluators.py @@ -6,6 +6,7 @@ import numpy as np import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Argument, Node, Target from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -13,8 +14,10 @@ ConverterRegistry, dynamo_tensorrt_converter, ) -from torch_tensorrt.fx.types import TRTTensor -from torch_tensorrt.fx.utils import Frameworks, unified_dtype_converter +from torch_tensorrt.dynamo.utils import ( + Frameworks, + unified_dtype_converter, +) _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/prims_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/prims_ops_converters.py index 923ca9be6c..8f2da209b1 100644 --- a/py/torch_tensorrt/dynamo/conversion/prims_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/prims_ops_converters.py @@ -2,6 +2,7 @@ from typing import Dict, Sequence, Tuple, Union import torch +from tensorrt import ITensor as TRTTensor from torch.fx.node import Argument, Target from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR @@ -10,7 +11,6 @@ from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, ) -from torch_tensorrt.fx.types import TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index b0e41f7aeb..3197d9f7de 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -95,7 +95,6 @@ def __init__( dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution truncate_double (bool): Truncate weights provided in double (float64) to float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (Collection[Target]): Set of aten operators that must be run in PyTorch. 
An error will be thrown if this set is not empty but ``require_full_compilation`` is True diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 0703fd1cb9..de736db1bf 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -84,6 +84,37 @@ class Frameworks(Enum): } +def unified_dtype_converter( + dtype: Union[TRTDataType, torch.dtype, np.dtype], to: Frameworks +) -> Union[np.dtype, torch.dtype, TRTDataType]: + """ + Convert TensorRT, Numpy, or Torch data types to any other of those data types. + + Args: + dtype (TRTDataType, torch.dtype, np.dtype): A TensorRT, Numpy, or Torch data type. + to (Frameworks): The framework to convert the data type to. + + Returns: + The equivalent data type in the requested framework. + """ + assert to in Frameworks, f"Expected valid Framework for translation, got {to}" + trt_major_version = int(trt.__version__.split(".")[0]) + if dtype in (np.int8, torch.int8, trt.int8): + return DataTypeEquivalence[trt.int8][to] + elif trt_major_version >= 7 and dtype in (np.bool_, torch.bool, trt.bool): + return DataTypeEquivalence[trt.bool][to] + elif dtype in (np.int32, torch.int32, trt.int32): + return DataTypeEquivalence[trt.int32][to] + elif dtype in (np.int64, torch.int64, trt.int64): + return DataTypeEquivalence[trt.int64][to] + elif dtype in (np.float16, torch.float16, trt.float16): + return DataTypeEquivalence[trt.float16][to] + elif dtype in (np.float32, torch.float32, trt.float32): + return DataTypeEquivalence[trt.float32][to] + else: + raise TypeError("%s is not a supported dtype" % dtype) + + def deallocate_module(module: torch.fx.GraphModule, delete_module: bool = True) -> None: """ This is a helper function to delete the instance of module. We first move it to CPU and then diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py index 6016fe87c5..0d0b12723e 100644 --- a/py/torch_tensorrt/ts/_compile_spec.py +++ b/py/torch_tensorrt/ts/_compile_spec.py @@ -239,9 +239,6 @@ def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec: compile_spec["enabled_precisions"] ) - if "calibrator" in compile_spec and compile_spec["calibrator"]: - info.ptq_calibrator = compile_spec["calibrator"] - if "sparse_weights" in compile_spec: assert isinstance(compile_spec["sparse_weights"], bool) info.sparse_weights = compile_spec["sparse_weights"] @@ -319,7 +316,6 @@ def TensorRTCompileSpec( dla_local_dram_size: int = 1073741824, dla_global_dram_size: int = 536870912, truncate_long_and_double: bool = False, - calibrator: object = None, allow_shape_tensors: bool = False, ) -> torch.classes.tensorrt.CompileSpec: """Utility to create a formatted spec dictionary for using the PyTorch TensorRT backend @@ -354,7 +350,6 @@ def TensorRTCompileSpec( num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels workspace_size (int): Maximum size of workspace given to TensorRT truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT Returns: @@ -378,7 +373,6 @@ def TensorRTCompileSpec( "dla_sram_size": dla_sram_size, # Fast software managed RAM used by DLA to communicate within a layer. 
"dla_local_dram_size": dla_local_dram_size, # Host RAM used by DLA to share intermediate tensor data across operations "dla_global_dram_size": dla_global_dram_size, # Host RAM used by DLA to store weights and metadata for execution - "calibrator": calibrator, "truncate_long_and_double": truncate_long_and_double, "allow_shape_tensors": allow_shape_tensors, } @@ -433,6 +427,5 @@ def TensorRTCompileSpec( backend_spec._set_dla_global_dram_size(parsed_spec.dla_global_dram_size) backend_spec._set_truncate_long_and_double(parsed_spec.truncate_long_and_double) backend_spec._set_allow_shape_tensors(parsed_spec.allow_shape_tensors) - backend_spec._set_ptq_calibrator(parsed_spec._get_calibrator_handle()) return backend_spec diff --git a/py/torch_tensorrt/ts/_compiler.py b/py/torch_tensorrt/ts/_compiler.py index 114398f010..4bcbf058bc 100644 --- a/py/torch_tensorrt/ts/_compiler.py +++ b/py/torch_tensorrt/ts/_compiler.py @@ -27,7 +27,6 @@ def compile( dla_sram_size: int = 1048576, dla_local_dram_size: int = 1073741824, dla_global_dram_size: int = 536870912, - calibrator: object = None, truncate_long_and_double: bool = False, require_full_compilation: bool = False, min_block_size: int = 3, @@ -92,7 +91,6 @@ def compile( dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration require_full_compilation (bool): Require modules to be compiled end to end or return an error as opposed to returning a hybrid graph where operations that cannot be run in TensorRT are run in PyTorch min_block_size (int): The minimum number of contiguous TensorRT convertible operations in order to run a set of operations in TensorRT torch_executed_ops (List[str]): List of aten operators that must be run in PyTorch. 
An error will be thrown if this list is not empty but ``require_full_compilation`` is True @@ -147,7 +145,6 @@ def compile( "dla_sram_size": dla_sram_size, "dla_local_dram_size": dla_local_dram_size, "dla_global_dram_size": dla_global_dram_size, - "calibrator": calibrator, "truncate_long_and_double": truncate_long_and_double, "torch_fallback": { "enabled": not require_full_compilation, @@ -182,7 +179,6 @@ def convert_method_to_trt_engine( dla_local_dram_size: int = 1073741824, dla_global_dram_size: int = 536870912, truncate_long_and_double: int = False, - calibrator: object = None, allow_shape_tensors: bool = False, ) -> bytes: """Convert a TorchScript module method to a serialized TensorRT engine @@ -241,7 +237,6 @@ def convert_method_to_trt_engine( dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32 - calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT Returns: @@ -274,7 +269,6 @@ def convert_method_to_trt_engine( "capability": capability, # Restrict kernel selection to safe gpu kernels or safe dla kernels "num_avg_timing_iters": num_avg_timing_iters, # Number of averaging timing iterations used to select kernels "workspace_size": workspace_size, # Maximum size of workspace given to TensorRT - "calibrator": calibrator, "truncate_long_and_double": truncate_long_and_double, "allow_shape_tensors": allow_shape_tensors, } diff --git a/py/torch_tensorrt/ts/ptq.py b/py/torch_tensorrt/ts/ptq.py deleted file mode 100644 index db55aa47e4..0000000000 --- a/py/torch_tensorrt/ts/ptq.py +++ /dev/null @@ -1,233 +0,0 @@ -import sys -from typing import Any, List, Optional - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -import os -import warnings -from enum import Enum - -import torch -from torch_tensorrt import _C -from torch_tensorrt.ts.logging import Level, log - - -class CalibrationAlgo(Enum): - ENTROPY_CALIBRATION = _C.CalibrationAlgo.ENTROPY_CALIBRATION - ENTROPY_CALIBRATION_2 = _C.CalibrationAlgo.ENTROPY_CALIBRATION_2 - LEGACY_CALIBRATION = _C.CalibrationAlgo.LEGACY_CALIBRATION - MINMAX_CALIBRATION = _C.CalibrationAlgo.MINMAX_CALIBRATION - - -def get_cache_mode_batch(self: object) -> None: - return None - - -def get_batch_size(self: object) -> int: - return 1 - - -def get_batch(self: object, _: Any) -> Optional[List[int]]: - if self.current_batch_idx + self.batch_size > len(self.data_loader.dataset): - return None - - batch = next(self.dataset_iterator) - self.current_batch_idx += self.batch_size - inputs_gpu = [] - if isinstance(batch, list): - for example in batch: - inputs_gpu.append(example.to(self.device).data_ptr()) - else: - inputs_gpu.append(batch.to(self.device).data_ptr()) - return inputs_gpu - - -def read_calibration_cache(self: object) -> bytes: - if self.cache_file and self.use_cache: - if os.path.exists(self.cache_file): - with open(self.cache_file, "rb") as f: - b: bytes = f.read() - return b - else: - raise FileNotFoundError(self.cache_file) - else: - return b"" - - -def write_calibration_cache(self: object, cache: bytes) -> None: - if self.cache_file: - with 
open(self.cache_file, "wb") as f: - f.write(cache) - else: - return - - -# deepcopy (which involves pickling) is performed on the compile_spec internally during compilation. -# We register this __reduce__ function for pickler to identity the calibrator object returned by DataLoaderCalibrator during deepcopy. -# This should be the object's local name relative to the module https://docs.python.org/3/library/pickle.html#object.__reduce__ -def __reduce__(self: object) -> str: - return self.__class__.__name__ - - -class DataLoaderCalibrator(object): - """ - Constructs a calibrator class in TensorRT and uses pytorch dataloader to load/preprocess - data which is passed during calibration. - - Arguments: - dataloader (torch.utils.data.DataLoader): an instance of pytorch dataloader which iterates through a given dataset. - algo_type (CalibrationAlgo): choice of calibration algorithm. - cache_file (str): path to cache file. - use_cache (bool): flag which enables usage of pre-existing cache. - device (Device): device on which calibration data is copied to. - """ - - def __init__(self, **kwargs: Any): - pass - - def __new__(cls, *args: Any, **kwargs: Any) -> Self: - warnings.warn( - "Int8 PTQ Calibrator has been deprecated by TensorRT, please plan on porting to a NVIDIA Model Optimizer Toolkit based workflow. See: https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/vgg16_ptq.html for more details", - DeprecationWarning, - stacklevel=2, - ) - dataloader = args[0] - algo_type = kwargs.get("algo_type", CalibrationAlgo.ENTROPY_CALIBRATION_2) - cache_file = kwargs.get("cache_file", None) - use_cache = kwargs.get("use_cache", False) - device = kwargs.get("device", torch.device("cuda:0")) - - if not isinstance(dataloader, torch.utils.data.DataLoader): - log( - Level.Error, - "Dataloader : {} is not a valid instance of torch.utils.data.DataLoader".format( - dataloader - ), - ) - - if cache_file: - if use_cache: - log( - Level.Debug, - "Using existing cache_file {} for calibration".format(cache_file), - ) - else: - log(Level.Debug, "Overwriting existing calibration cache file.") - else: - if use_cache: - log( - Level.Warning, - "Input cache file is None but use_cache is set to True in INT8 mode. 
Ignoring use_cache flag in this run.", - ) - - # Define attributes and member functions for the calibrator class - attribute_mapping = { - "data_loader": dataloader, - "current_batch_idx": 0, - "batch_size": dataloader.batch_size, - "dataset_iterator": iter(dataloader), - "cache_file": cache_file, - "device": device, - "use_cache": use_cache, - "get_batch_size": get_batch_size, - "get_batch": get_cache_mode_batch if use_cache else get_batch, - "read_calibration_cache": read_calibration_cache, - "write_calibration_cache": write_calibration_cache, - "__reduce__": __reduce__, # used when you deepcopy the DataLoaderCalibrator object - } - - # Using type metaclass to construct calibrator class based on algorithm type - if algo_type == CalibrationAlgo.ENTROPY_CALIBRATION: - calib_ec: Self = type( - "Int8EntropyCalibrator", (_C.IInt8EntropyCalibrator,), attribute_mapping - )() - return calib_ec - elif algo_type == CalibrationAlgo.ENTROPY_CALIBRATION_2: - calib_ec2: Self = type( - "Int8EntropyCalibrator2", - (_C.IInt8EntropyCalibrator2,), - attribute_mapping, - )() - return calib_ec2 - elif algo_type == CalibrationAlgo.LEGACY_CALIBRATION: - calib_lc: Self = type( - "Int8LegacyCalibrator", (_C.IInt8LegacyCalibrator,), attribute_mapping - )() - return calib_lc - elif algo_type == CalibrationAlgo.MINMAX_CALIBRATION: - calib_mmc: Self = type( - "Int8MinMaxCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping - )() - return calib_mmc - else: - raise ValueError( - "Invalid calibration algorithm type. Please select among ENTROPY_CALIBRATION, ENTROPY_CALIBRATION, LEGACY_CALIBRATION or MINMAX_CALIBRATION" - ) - - -class CacheCalibrator(object): - """ - Constructs a calibrator class in TensorRT which directly uses pre-existing cache file for calibration. - - Arguments: - cache_file (str): path to cache file. - algo_type (CalibrationAlgo): choice of calibration algorithm. - """ - - def __init__(self, **kwargs: Any): - pass - - def __new__(cls, *args: Any, **kwargs: Any) -> Self: - warnings.warn( - "Int8 PTQ Calibrator has been deprecated by TensorRT, please plan on porting to a NVIDIA Model Optimizer Toolkit based workflow. 
See: https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/vgg16_ptq.html for more details", - DeprecationWarning, - stacklevel=2, - ) - cache_file = args[0] - algo_type = kwargs.get("algo_type", CalibrationAlgo.ENTROPY_CALIBRATION_2) - - if os.path.isfile(cache_file): - log( - Level.Debug, - "Using existing cache_file {} for calibration".format(cache_file), - ) - else: - log(Level.Error, "Invalid calibration cache file.") - - # Define attributes and member functions for the calibrator class - attribute_mapping = { - "use_cache": True, - "cache_file": cache_file, - "get_batch_size": get_batch_size, - "get_batch": get_cache_mode_batch, - "read_calibration_cache": read_calibration_cache, - "write_calibration_cache": write_calibration_cache, - } - # Using type metaclass to construct calibrator class based on algorithm type - if algo_type == CalibrationAlgo.ENTROPY_CALIBRATION: - calib_ec: Self = type( - "DataLoaderCalibrator", (_C.IInt8EntropyCalibrator,), attribute_mapping - )() - return calib_ec - elif algo_type == CalibrationAlgo.ENTROPY_CALIBRATION_2: - calib_ec2: Self = type( - "DataLoaderCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping - )() - return calib_ec2 - elif algo_type == CalibrationAlgo.LEGACY_CALIBRATION: - calib_lc: Self = type( - "DataLoaderCalibrator", (_C.IInt8LegacyCalibrator,), attribute_mapping - )() - return calib_lc - elif algo_type == CalibrationAlgo.MINMAX_CALIBRATION: - calib_mmc: Self = type( - "DataLoaderCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping - )() - return calib_mmc - else: - raise ValueError( - "Invalid calibration algorithm type. Please select among ENTROPY_CALIBRATION, ENTROPY_CALIBRATION, LEGACY_CALIBRATION or MINMAX_CALIBRATION" - ) diff --git a/tests/py/ts/BUILD b/tests/py/ts/BUILD index 98db68fc44..0da75f7b10 100644 --- a/tests/py/ts/BUILD +++ b/tests/py/ts/BUILD @@ -27,30 +27,6 @@ py_test( ], ) -py_test( - name = "test_ptq_dataloader_calibrator", - srcs = [ - "model_test_case.py", - "test_ptq_dataloader_calibrator.py", - ], - deps = [ - requirement("torchvision"), - ], -) - -# This test is not included in the main test suite by default. This test checks -# if trtorch can use pre-existing trt calibrators already implemented by users. -py_test( - name = "test_ptq_trt_calibrator", - srcs = [ - "model_test_case.py", - "test_ptq_trt_calibrator.py", - ], - deps = [ - requirement("torchvision"), - ], -) - # Following multi_gpu test is only targeted for multi-gpu configurations. It is not included in the test suite by default. 
py_test( name = "test_multi_gpu", @@ -84,23 +60,3 @@ py_test( requirement("torchvision"), ], ) - -py_test( - name = "test_ptq_to_backend", - srcs = [ - "model_test_case.py", - "test_ptq_to_backend.py", - ], - deps = [ - requirement("torchvision"), - ], -) - -test_suite( - name = "py_calibrator_tests", - tests = [ - ":test_ptq_dataloader_calibrator", - ":test_ptq_to_backend", - ":test_ptq_trt_calibrator", - ], -) diff --git a/tests/py/ts/ptq/test_ptq_dataloader_calibrator.py b/tests/py/ts/ptq/test_ptq_dataloader_calibrator.py deleted file mode 100644 index 9e3068ec3b..0000000000 --- a/tests/py/ts/ptq/test_ptq_dataloader_calibrator.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -import unittest - -import torch -import torch.nn as nn -import torch_tensorrt as torchtrt -import torch_tensorrt.ts.ptq as PTQ -import torchvision -import torchvision.transforms as transforms -from torch.nn import functional as F -from torch_tensorrt.ts.logging import * - - -def find_repo_root(max_depth=10): - dir_path = os.path.dirname(os.path.realpath(__file__)) - for i in range(max_depth): - files = os.listdir(dir_path) - if "MODULE.bazel" in files: - return dir_path - else: - dir_path = os.path.dirname(dir_path) - - raise RuntimeError("Could not find repo root") - - -MODULE_DIR = find_repo_root() + "/tests/modules" - - -def compute_accuracy(testing_dataloader, model): - total = 0 - correct = 0 - loss = 0.0 - class_probs = [] - class_preds = [] - device = torch.device("cuda:0") - with torch.no_grad(): - idx = 0 - for data, labels in testing_dataloader: - data, labels = data.to(device), labels.to(device) - out = model(data) - preds = torch.max(out, 1)[1] - class_probs.append([F.softmax(i, dim=0) for i in out]) - class_preds.append(preds) - total += labels.size(0) - correct += (preds == labels).sum().item() - idx += 1 - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_preds = torch.cat(class_preds) - return correct / total - - -@unittest.skipIf( - not torchtrt.ENABLED_FEATURES.torchscript_frontend, - "TorchScript Frontend is not available", -) -class TestAccuracy(unittest.TestCase): - def test_compile_script(self): - self.model = ( - torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda") - ) - self.input = torch.randn((1, 3, 32, 32)).to("cuda") - self.testing_dataset = torchvision.datasets.CIFAR10( - root="./data", - train=False, - download=True, - transform=transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010) - ), - ] - ), - ) - - self.testing_dataloader = torch.utils.data.DataLoader( - self.testing_dataset, batch_size=1, shuffle=False, num_workers=1 - ) - self.calibrator = PTQ.DataLoaderCalibrator( - self.testing_dataloader, - cache_file="./calibration.cache", - use_cache=False, - algo_type=PTQ.CalibrationAlgo.ENTROPY_CALIBRATION_2, - device=torch.device("cuda:0"), - ) - - compile_spec = { - "inputs": [torchtrt.Input([1, 3, 32, 32])], - "enabled_precisions": {torch.float, torch.int8}, - "calibrator": self.calibrator, - "truncate_long_and_double": True, - "device": { - "device_type": torchtrt.DeviceType.GPU, - "gpu_id": 0, - "dla_core": 0, - "allow_gpu_fallback": False, - }, - } - trt_mod = torchtrt.ts.compile(self.model, **compile_spec) - - fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model) - log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc)) - - int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod) - log(Level.Info, "[TRT INT8] Test Acc: 
{:.2f}%".format(100 * int8_test_acc)) - acc_diff = fp32_test_acc - int8_test_acc - self.assertTrue(abs(acc_diff) < 3) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/py/ts/ptq/test_ptq_to_backend.py b/tests/py/ts/ptq/test_ptq_to_backend.py deleted file mode 100644 index 015ce97126..0000000000 --- a/tests/py/ts/ptq/test_ptq_to_backend.py +++ /dev/null @@ -1,119 +0,0 @@ -import os -import unittest - -import torch -import torch.nn as nn -import torch_tensorrt as torchtrt -import torchvision -import torchvision.transforms as transforms -import torch_tensorrt.ts.ptq as PTQ -from torch.nn import functional as F -from torch_tensorrt.ts.logging import * - - -def find_repo_root(max_depth=10): - dir_path = os.path.dirname(os.path.realpath(__file__)) - for i in range(max_depth): - files = os.listdir(dir_path) - if "WORKSPACE" in files: - return dir_path - else: - dir_path = os.path.dirname(dir_path) - - raise RuntimeError("Could not find repo root") - - -MODULE_DIR = find_repo_root() + "/tests/modules" - - -def compute_accuracy(testing_dataloader, model): - total = 0 - correct = 0 - loss = 0.0 - class_probs = [] - class_preds = [] - device = torch.device("cuda:0") - with torch.no_grad(): - idx = 0 - for data, labels in testing_dataloader: - data, labels = data.to(device), labels.to(device) - out = model(data) - preds = torch.max(out, 1)[1] - class_probs.append([F.softmax(i, dim=0) for i in out]) - class_preds.append(preds) - total += labels.size(0) - correct += (preds == labels).sum().item() - idx += 1 - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_preds = torch.cat(class_preds) - return correct / total - - -@unittest.skipIf( - not torchtrt.ENABLED_FEATURES.torchscript_frontend, - "TorchScript Frontend is not available", -) -class TestAccuracy(unittest.TestCase): - def test_compile_script(self): - self.model = ( - torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda") - ) - self.input = torch.randn((1, 3, 32, 32)).to("cuda") - self.testing_dataset = torchvision.datasets.CIFAR10( - root="./data", - train=False, - download=True, - transform=transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010) - ), - ] - ), - ) - - self.testing_dataloader = torch.utils.data.DataLoader( - self.testing_dataset, batch_size=1, shuffle=False, num_workers=1 - ) - self.calibrator = PTQ.DataLoaderCalibrator( - self.testing_dataloader, - cache_file="./calibration.cache", - use_cache=False, - algo_type=PTQ.CalibrationAlgo.ENTROPY_CALIBRATION_2, - device=torch.device("cuda:0"), - ) - - self.spec = { - "forward": torchtrt.ts.TensorRTCompileSpec( - **{ - "inputs": [torchtrt.Input([1, 3, 32, 32])], - "enabled_precisions": {torch.float, torch.half, torch.int8}, - "calibrator": self.calibrator, - "truncate_long_and_double": True, - "device": { - "device_type": torchtrt.DeviceType.GPU, - "gpu_id": 0, - "dla_core": 0, - "allow_gpu_fallback": False, - }, - } - ) - } - - fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model) - log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc)) - - trt_mod = torch._C._jit_to_backend("tensorrt", self.model, self.spec) - int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod) - log( - Level.Info, - "[TRT INT8 Backend] Test Acc: {:.2f}%".format(100 * int8_test_acc), - ) - acc_diff = fp32_test_acc - int8_test_acc - self.assertTrue(abs(acc_diff) < 3) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/tests/py/ts/ptq/test_ptq_trt_calibrator.py b/tests/py/ts/ptq/test_ptq_trt_calibrator.py deleted file mode 100644 index bef057081b..0000000000 --- a/tests/py/ts/ptq/test_ptq_trt_calibrator.py +++ /dev/null @@ -1,156 +0,0 @@ -import os -import unittest - -import tensorrt as trt -import torch -import torch.nn as nn -import torch_tensorrt as torchtrt -import torchvision -import torchvision.transforms as transforms -from torch.nn import functional as F -from torch_tensorrt.ts.logging import * - - -def find_repo_root(max_depth=10): - dir_path = os.path.dirname(os.path.realpath(__file__)) - for i in range(max_depth): - files = os.listdir(dir_path) - if "MODULE.bazel" in files: - return dir_path - else: - dir_path = os.path.dirname(dir_path) - - raise RuntimeError("Could not find repo root") - - -MODULE_DIR = find_repo_root() + "/tests/modules" - - -def compute_accuracy(testing_dataloader, model): - total = 0 - correct = 0 - loss = 0.0 - class_probs = [] - class_preds = [] - device = torch.device("cuda:0") - with torch.no_grad(): - idx = 0 - for data, labels in testing_dataloader: - data, labels = data.to(device), labels.to(device) - out = model(data) - preds = torch.max(out, 1)[1] - class_probs.append([F.softmax(i, dim=0) for i in out]) - class_preds.append(preds) - total += labels.size(0) - correct += (preds == labels).sum().item() - idx += 1 - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_preds = torch.cat(class_preds) - return correct / total - - -@unittest.skipIf( - not torchtrt.ENABLED_FEATURES.torchscript_frontend, - "TorchScript Frontend is not available", -) -class TRTEntropyCalibrator(trt.IInt8EntropyCalibrator2): - def __init__(self, dataloader, **kwargs): - trt.IInt8EntropyCalibrator2.__init__(self) - - self.cache_file = kwargs.get("cache_file", None) - self.use_cache = kwargs.get("use_cache", False) - self.device = kwargs.get("device", torch.device("cuda:0")) - - self.dataloader = dataloader - self.dataset_iterator = iter(dataloader) - self.batch_size = dataloader.batch_size - self.current_batch_idx = 0 - - def get_batch_size(self): - return 1 - - # TensorRT passes along the names of the engine bindings to the get_batch function. - # You don't necessarily have to use them, but they can be useful to understand the order of - # the inputs. The bindings list is expected to have the same ordering as 'names'. - def get_batch(self, names): - if ( - self.current_batch_idx + self.batch_size - > self.dataloader.dataset.data.shape[0] - ): - return None - - batch = next(self.dataset_iterator) - self.current_batch_idx += self.batch_size - # Treat the first element as input and others as targets. - if isinstance(batch, list): - batch = batch[0].to(self.device) - return [batch.data_ptr()] - - def read_calibration_cache(self): - # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
- if self.use_cache: - with open(self.cache_file, "rb") as f: - return f.read() - - def write_calibration_cache(self, cache): - if self.cache_file: - with open(self.cache_file, "wb") as f: - f.write(cache) - - -@unittest.skipIf( - not torchtrt.ENABLED_FEATURES.torchscript_frontend, - "TorchScript Frontend is not available", -) -class TestAccuracy(unittest.TestCase): - def test_compile_script(self): - self.model = ( - torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda") - ) - self.input = torch.randn((1, 3, 32, 32)).to("cuda") - self.testing_dataset = torchvision.datasets.CIFAR10( - root="./data", - train=False, - download=True, - transform=transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010) - ), - ] - ), - ) - - self.testing_dataloader = torch.utils.data.DataLoader( - self.testing_dataset, batch_size=1, shuffle=False, num_workers=1 - ) - # Test cases can assume using GPU id: 0 - self.calibrator = TRTEntropyCalibrator(self.testing_dataloader) - - fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model) - log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc)) - - compile_spec = { - "inputs": [torchtrt.Input([1, 3, 32, 32])], - "enabled_precisions": {torch.float, torch.int8}, - "calibrator": self.calibrator, - "truncate_long_and_double": True, - "device": { - "device_type": torchtrt.DeviceType.GPU, - "gpu_id": 0, - "dla_core": 0, - "allow_gpu_fallback": False, - }, - } - - trt_mod = torchtrt.ts.compile(self.model, **compile_spec) - int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod) - log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc)) - acc_diff = fp32_test_acc - int8_test_acc - self.assertTrue(abs(acc_diff) < 3) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/py/ts/qat/test_qat_trt_accuracy.py b/tests/py/ts/qat/test_qat_trt_accuracy.py deleted file mode 100644 index ade2cfc865..0000000000 --- a/tests/py/ts/qat/test_qat_trt_accuracy.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -import sys -import unittest - -import torch -import torch.nn as nn -import torch_tensorrt as torchtrt -import torchvision -import torchvision.transforms as transforms -from torch.nn import functional as F -from torch_tensorrt.ts.logging import * - - -def find_repo_root(max_depth=10): - dir_path = os.path.dirname(os.path.realpath(__file__)) - for i in range(max_depth): - files = os.listdir(dir_path) - if "WORKSPACE" in files: - return dir_path - else: - dir_path = os.path.dirname(dir_path) - - raise RuntimeError("Could not find repo root") - - -MODULE_DIR = find_repo_root() + "/tests/modules" - -set_reportable_log_level(Level.Graph) - - -def compute_accuracy(testing_dataloader, model): - total = 0 - correct = 0 - loss = 0.0 - class_probs = [] - class_preds = [] - device = torch.device("cuda:0") - with torch.no_grad(): - idx = 0 - for data, labels in testing_dataloader: - data, labels = data.to(device), labels.to(device) - out = model(data) - preds = torch.max(out, 1)[1] - class_probs.append([F.softmax(i, dim=0) for i in out]) - class_preds.append(preds) - total += labels.size(0) - correct += (preds == labels).sum().item() - idx += 1 - - test_probs = torch.cat([torch.stack(batch) for batch in class_probs]) - test_preds = torch.cat(class_preds) - return correct / total - - -@unittest.skipIf( - not torchtrt.ENABLED_FEATURES.torchscript_frontend, - "TorchScript Frontend is not available", -) -class TestAccuracy(unittest.TestCase): - def 
test_compile_script(self): - self.model = ( - torch.jit.load(MODULE_DIR + "/trained_vgg16_qat.jit.pt").eval().to("cuda") - ) - self.testing_dataset = torchvision.datasets.CIFAR10( - root="./data", - train=False, - download=True, - transform=transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010) - ), - ] - ), - ) - - self.testing_dataloader = torch.utils.data.DataLoader( - self.testing_dataset, batch_size=16, shuffle=False, num_workers=1 - ) - - fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model) - log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc)) - - compile_spec = { - "inputs": [torchtrt.Input([16, 3, 32, 32])], - "enabled_precisions": {torch.int8}, - # "enabled_precision": {torch.float32, torch.int8}, - } - - trt_mod = torchtrt.ts.compile(self.model, **compile_spec) - int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod) - log(Level.Info, "[TRT QAT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc)) - acc_diff = fp32_test_acc - int8_test_acc - self.assertTrue(abs(acc_diff) < 3) - - -if __name__ == "__main__": - unittest.main()
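
For reference, a minimal usage sketch of the `unified_dtype_converter` helper this patch adds to `py/torch_tensorrt/dynamo/utils.py` (and which `ops_evaluators.py` and `impl/shape.py` now import from there instead of from `torch_tensorrt.fx`). The `Frameworks` member names `NUMPY`/`TORCH`/`TRT` are assumed to match the `fx.utils` enum being superseded:

```python
# Illustrative only: dtype translation via the helper added in this patch.
# Assumes torch_tensorrt is built with this change and that Frameworks exposes
# NUMPY / TORCH / TRT members, as in the fx.utils module it replaces.
import numpy as np
import tensorrt as trt
import torch

from torch_tensorrt.dynamo.utils import Frameworks, unified_dtype_converter

print(unified_dtype_converter(torch.float16, Frameworks.TRT))  # trt.float16
print(unified_dtype_converter(trt.int32, Frameworks.NUMPY))    # np.int32
print(unified_dtype_converter(np.int64, Frameworks.TORCH))     # torch.int64

try:
    # uint8 is not in the dispatch table, so the helper raises TypeError.
    unified_dtype_converter(np.uint8, Frameworks.TRT)
except TypeError as err:
    print(err)
```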
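
With the `DataLoaderCalibrator`/`CacheCalibrator` classes, the `calibrator` compile option, and the TorchScript PTQ/QAT accuracy tests removed above, INT8 compilation relies on networks that already carry Q/DQ nodes. A hedged sketch of the NVIDIA Model Optimizer workflow that the deprecation warnings point to (the linked `vgg16_ptq` tutorial); the model, config name, export context manager, and compile arguments are assumptions for illustration, not part of this patch:

```python
# Hypothetical modelopt-based INT8 flow replacing the removed calibrator path.
# Assumes nvidia-modelopt is installed; INT8_DEFAULT_CFG and export_torch_mode
# are taken from the tutorial referenced in the deprecation warnings above.
import modelopt.torch.quantization as mtq
import torch
import torch_tensorrt
from modelopt.torch.quantization.utils import export_torch_mode

# Placeholder model and calibration data standing in for a real network/dataset.
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.ReLU(),
    torch.nn.AdaptiveAvgPool2d(1),
    torch.nn.Flatten(),
    torch.nn.Linear(8, 10),
).eval().cuda()
calib_inputs = [torch.randn(1, 3, 32, 32).cuda() for _ in range(8)]

def calibrate_loop(m: torch.nn.Module) -> None:
    # Run representative data through the model so modelopt can record ranges.
    with torch.no_grad():
        for x in calib_inputs:
            m(x)

# Insert and calibrate Q/DQ nodes in PyTorch; no TensorRT calibrator object.
model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop=calibrate_loop)

# Export the quantized module and compile with INT8 enabled; the Q/DQ nodes
# carry the scales into the TensorRT engine.
with torch.no_grad(), export_torch_mode():
    exp_program = torch.export.export(model, (calib_inputs[0],))
    trt_model = torch_tensorrt.dynamo.compile(
        exp_program,
        inputs=[calib_inputs[0]],
        enabled_precisions={torch.int8},
    )
```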