Ptq stabilization #72

Merged (4 commits, May 28, 2020)
Changes from all commits
1 change: 1 addition & 0 deletions cpp/api/BUILD
@@ -12,6 +12,7 @@ cc_library(
"src/extra_info.cpp",
"src/logging.cpp",
"src/trtorch.cpp",
"src/ptq.cpp"
],
deps = [
"//core",
…
54 changes: 26 additions & 28 deletions cpp/api/include/trtorch/ptq.h
@@ -6,16 +6,21 @@
#include <iostream>
#include <sstream>

#include "trtorch/logging.h"

#ifndef DOXYGEN_SHOULD_SKIP_THIS
namespace nvinfer1 {
class IInt8Calibrator;
class IInt8EntropyCalibrator2;
}

namespace torch {
-namespace data {
-template<typename Example>
-class Iterator;
class Tensor;
}

+namespace trtorch {
+namespace ptq {
+bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data);
+}
+}
#endif //DOXYGEN_SHOULD_SKIP_THIS
@@ -45,7 +50,12 @@ class Int8Calibrator : Algorithm {
* @param use_cache : bool - Whether to use the cache (if it exists)
*/
Int8Calibrator(DataLoaderUniquePtr dataloader, const std::string& cache_file_path, bool use_cache)
-: dataloader_(dataloader.get()), it_(dataloader_->end()), cache_file_path_(cache_file_path), use_cache_(use_cache) {}
+: dataloader_(dataloader.get()), cache_file_path_(cache_file_path), use_cache_(use_cache) {
+for (auto batch : *dataloader_) {
+batched_data_.push_back(batch.data);
+}
+it_ = batched_data_.begin();
+}

/**
* @brief Get the Batch Size for the next batch (always 1 due to issues with TRT and explicit batch)
@@ -70,26 +80,15 @@
* @return false - There is not a new batch for the calibrator to consume
*/
bool getBatch(void* bindings[], const char* names[], int nbBindings) override {
-// HACK: doesn't seem like the first try in the initializer list works
-if (! it_created_) {
-it_ = dataloader_->begin();
-it_created_ = true;
-}
-
-if (it_ == dataloader_->end()) {
+if (it_ != batched_data_.end()) {
+auto status = get_batch_impl(bindings, names, nbBindings, *it_);
+it_ = ++it_;
+return status;
+} else {
+// Reset the iterator in case the calibrator is going to be used again
+it_ = batched_data_.begin();
return false;
}

-auto batch = *it_;
-
-for (int i = 0; i < nbBindings; i++) {
-auto data = batch.data;
-data = data.to(at::kCUDA).contiguous();
-bindings[i] = data.data_ptr();
-}
-
-it_ = ++it_;
-return true;
}

/**
@@ -151,8 +150,6 @@ class Int8Calibrator : Algorithm {
private:
/// Pointer to the dataloader
DataLoader* dataloader_;
-/// Iterator used to traverse the dataloader
-torch::data::Iterator<Batch> it_;
/// Path to cache file
const std::string& cache_file_path_;
/// Size of cache
@@ -161,10 +158,11 @@
bool use_cache_;
/// Cache data
std::vector<char> cache_;
-/// If the iterator has been created, DataLoaders can only have 1 live iterator,
-/// due to some issues this cannot be created at construction, so it is set in the first
-/// batch, controlled by this flag
-bool it_created_ = false;
+/// Batched Data
+std::vector<torch::Tensor> batched_data_;
+/// Iterator to move through dataset
+std::vector<torch::Tensor>::iterator it_;

};

/**
…
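A note on the new calibrator design: materializing every batch into `batched_data_` at construction trades host memory for robustness. It removes the old lazy-iterator HACK (a `torch::data::DataLoader` supports only one live iterator, and it could not be created in the constructor's initializer list), and resetting `it_` to `batched_data_.begin()` once the data is exhausted makes the calibrator reusable for a subsequent engine build.

A minimal usage sketch follows, built around the `trtorch::ptq::make_int8_calibrator` helper defined further down this header (outside the visible hunk). The CIFAR10 dataset helper, its include path, and the `ptq_calibrator` field spelling are assumptions grounded in this PR and the `//cpp/ptq` example, not verbatim from the diff:

```cpp
#include "torch/script.h"
#include "torch/torch.h"
#include "trtorch/trtorch.h"
#include "trtorch/ptq.h"
#include "datasets/cifar10.h"  // from //cpp/ptq/datasets:cifar10 (path assumed)

// Sketch: compile a module to INT8 using the calibrator this PR stabilizes.
torch::jit::script::Module compile_int8(torch::jit::script::Module& mod,
                                        std::vector<int64_t> input_shape) {
  auto calibration_dataset =
      datasets::CIFAR10("data/cifar-10-batches-bin/", datasets::CIFAR10::Mode::kTest)
          .map(torch::data::transforms::Stack<>());
  auto calibration_dataloader = torch::data::make_data_loader(
      std::move(calibration_dataset),
      torch::data::DataLoaderOptions().batch_size(32).workers(2));

  // use_cache = true skips recalibration when the cache file already exists
  auto calibrator = trtorch::ptq::make_int8_calibrator(
      std::move(calibration_dataloader), "/tmp/calibration.cache", /*use_cache=*/true);

  trtorch::ExtraInfo extra_info({input_shape});
  extra_info.op_precision = torch::kI8;    // INT8 kernels need calibration data
  extra_info.ptq_calibrator = calibrator;  // calibrator must outlive CompileGraph

  return trtorch::CompileGraph(mod, extra_info);
}
```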
16 changes: 16 additions & 0 deletions cpp/api/src/ptq.cpp
@@ -0,0 +1,16 @@
#include "torch/torch.h"
#include "trtorch/ptq.h"

namespace trtorch {
namespace ptq {

bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data) {
for (int i = 0; i < nbBindings; i++) {
data = data.to(at::kCUDA).contiguous();
bindings[i] = data.data_ptr();
}
return true;
}

} // namespace ptq
} // namespace trtorch
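Defining `get_batch_impl` in a .cpp file means the public `ptq.h` header only needs the `torch::Tensor` forward declaration above, rather than pulling in `torch/torch.h`. A hypothetical smoke test of the contract (the binding name is illustrative; a CUDA device is required):

```cpp
#include <cassert>
#include "torch/torch.h"
#include "trtorch/ptq.h"

int main() {
  auto data = torch::rand({1, 3, 32, 32});  // one CIFAR10-shaped batch on the CPU
  const char* names[] = {"input_0"};
  void* bindings[] = {nullptr};

  bool ok = trtorch::ptq::get_batch_impl(bindings, names, /*nbBindings=*/1, data);

  // get_batch_impl copies `data` to the GPU, makes it contiguous, and points
  // every requested binding slot at the same device buffer.
  assert(ok && data.is_cuda() && bindings[0] == data.data_ptr());
  return 0;
}
```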
6 changes: 4 additions & 2 deletions cpp/benchmark/README.md
@@ -1,6 +1,6 @@
# Benchmarking

This is a quick benchmarking application for TRTorch. It lets you run supported TorchScript modules both in JIT and TRT and returns the average runtime and throughput.

## Compilation / Usage

@@ -20,12 +20,14 @@ bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DJIT" --cxxopt="-DTRT"

### Options

You can run a module with JIT or TRT via TRTorch in either FP32 or FP16. These options are controlled by preprocessor directives.

- To enable JIT profiling, add the argument `--cxxopt="-DJIT"`

- To enable TRT profiling, add the argument `--cxxopt="-DTRT"`

- To enable FP16 execution, add the argument `--cxxopt="-DHALF"`

- To also save the TRT engine, add the argument `--cxxopt="-DSAVE_ENGINE"`

> It's suggested to also define `--cxxopt="-DNDEBUG"` to suppress debug information
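For example, combining the options above, `bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DJIT" --cxxopt="-DTRT" --cxxopt="-DHALF" --cxxopt="-DSAVE_ENGINE"` would benchmark both paths in FP16 and also serialize the TRT engine.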
42 changes: 26 additions & 16 deletions cpp/benchmark/main.cpp
@@ -105,15 +105,6 @@ int main(int argc, const char* argv[]) {

mod.to(at::kCUDA);

-#ifdef HALF
-mod.to(torch::kHalf);
-for (auto layer : mod.named_modules()) {
-if (layer.name.find(".bn") != std::string::npos) {
-layer.value.to(torch::kFloat);
-}
-}
-#endif

std::vector<std::vector<int64_t>> dims;
for (int i = 2; i < argc; i++) {
auto arg = std::string(argv[i]);
@@ -129,23 +120,42 @@

at::globalContext().setBenchmarkCuDNN(true);

-#ifdef JIT
-auto jit_runtimes = benchmark_module(mod, dims[0]);
-print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
-#endif

#ifdef TRT
auto extra_info = trtorch::ExtraInfo(dims);
-extra_info.workspace_size = 1 << 24;
+extra_info.workspace_size = 1 << 20;

#ifdef HALF
-extra_info.op_precision = at::kHalf;
+extra_info.op_precision = torch::kF16;
#endif

auto trt_mod = trtorch::CompileGraph(mod, extra_info);

+#ifdef SAVE_ENGINE
+std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;
+auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", extra_info);
+std::ofstream out("/tmp/engine_converted_from_jit.trt");
+out << engine;
+out.close();
+#endif

auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
print_avg_std_dev("JIT/TRT", trt_runtimes, dims[0][0]);
#endif


+#ifdef HALF
+mod.to(torch::kHalf);
+for (auto layer : mod.named_modules()) {
+if (layer.name.find(".bn") != std::string::npos) {
+layer.value.to(torch::kFloat);
+}
+}
+#endif

+#ifdef JIT
+auto jit_runtimes = benchmark_module(mod, dims[0]);
+print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
+#endif

std::cout << "ok\n";
}
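For reference, the engine saved by `-DSAVE_ENGINE` can be reloaded with the stock TensorRT runtime. A minimal sketch, assuming the TensorRT 7 API that was current for this PR (note the file is read back in binary mode):

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include "NvInfer.h"

// TensorRT requires a logger implementation
class SimpleLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= Severity::kWARNING) {
      std::cout << msg << std::endl;
    }
  }
};

int main() {
  std::ifstream in("/tmp/engine_converted_from_jit.trt", std::ios::binary);
  std::stringstream buffer;
  buffer << in.rdbuf();
  std::string blob = buffer.str();

  SimpleLogger logger;
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size());
  if (!engine) {
    std::cerr << "failed to deserialize /tmp/engine_converted_from_jit.trt" << std::endl;
    return 1;
  }
  // ... create an execution context and enqueue inference here ...
  engine->destroy();
  runtime->destroy();
  return 0;
}
```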
2 changes: 1 addition & 1 deletion cpp/ptq/BUILD
@@ -4,9 +4,9 @@ cc_binary(
name = "ptq",
srcs = [
"main.cpp",
"timer.h"
],
deps = [
"//cpp/ptq/benchmark",
"//cpp/ptq/datasets:cifar10",
"@libtorch//:libtorch",
"@libtorch//:caffe2",
…
17 changes: 17 additions & 0 deletions cpp/ptq/benchmark/BUILD
@@ -0,0 +1,17 @@
package(default_visibility = ["//visibility:public"])

cc_library(
name = "benchmark",
hdrs = [
"benchmark.h"
],
srcs = [
"benchmark.cpp",
"timer.h"
],
deps = [
"@libtorch//:libtorch",
"@libtorch//:caffe2",
"//cpp/api:trtorch"
],
)
70 changes: 70 additions & 0 deletions cpp/ptq/benchmark/benchmark.cpp
@@ -0,0 +1,70 @@
#include "torch/script.h"
#include "torch/torch.h"
#include "ATen/Context.h"
#include "c10/cuda/CUDACachingAllocator.h"
#include "trtorch/trtorch.h"
#include "cuda_runtime_api.h"

#include "timer.h"

#define NUM_WARMUP_RUNS 20
#define NUM_RUNS 100

// Benchmarking code
void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size) {
float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
float fps = (1000.f / avg_runtime) * batch_size;
std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" <<std::endl;

std::vector<float> rt_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());

std::vector<float> fps_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
}

std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) {
auto execution_timer = timers::PreciseCPUTimer();
std::vector<float> execution_runtimes;

for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());

cudaDeviceSynchronize();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();

}

for (uint64_t i = 0; i < NUM_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());
cudaDeviceSynchronize();

execution_timer.start();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();
execution_timer.stop();

auto time = execution_timer.milliseconds();
execution_timer.reset();
execution_runtimes.push_back(time);

c10::cuda::CUDACachingAllocator::emptyCache();
}
return execution_runtimes;
}
4 changes: 4 additions & 0 deletions cpp/ptq/benchmark/benchmark.h
@@ -0,0 +1,4 @@
#pragma once

void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size);
std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape);
File renamed without changes: cpp/ptq/timer.h → cpp/ptq/benchmark/timer.h (matching the BUILD file changes above).
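The contents of timer.h are not shown in this diff. Judging only from its call sites in benchmark.cpp (`start()`, `stop()`, `milliseconds()`, `reset()`), a stand-in for `timers::PreciseCPUTimer` could be sketched as follows (a reconstruction under those assumptions, not the file's actual contents):

```cpp
#pragma once
#include <chrono>

namespace timers {
// Wall-clock timer accumulating elapsed time across start()/stop() pairs
class PreciseCPUTimer {
 public:
  void start() { start_ = std::chrono::high_resolution_clock::now(); }
  void stop() { elapsed_ += std::chrono::high_resolution_clock::now() - start_; }
  // Total measured time in milliseconds
  double milliseconds() const {
    return std::chrono::duration<double, std::milli>(elapsed_).count();
  }
  void reset() { elapsed_ = std::chrono::high_resolution_clock::duration::zero(); }

 private:
  std::chrono::high_resolution_clock::time_point start_;
  std::chrono::high_resolution_clock::duration elapsed_{};
};
} // namespace timers
```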