
Commit 6bd3c7f

fix comments
1 parent 9289fd4 commit 6bd3c7f

File tree: 5 files changed (+7, -54 lines)

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 0 additions & 2 deletions
@@ -10,7 +10,6 @@
 from .constant_folding import constant_fold
 from .fuse_distributed_ops import fuse_distributed_ops
 from .fuse_prims_broadcast import fuse_prims_broadcast
-from .lower_linear import lower_linear
 from .pass_manager import DynamoPassManager
 from .remove_assert_nodes import remove_assert_nodes
 from .remove_detach import remove_detach
@@ -29,7 +28,6 @@
     accumulate_fp32_matmul,
     remove_num_users_is_0_nodes,
     complex_graph_detection,
-    lower_linear,
 ]

 pre_lowering_pass_list = [

py/torch_tensorrt/dynamo/lowering/passes/lower_linear.py

Lines changed: 0 additions & 42 deletions
This file was deleted.

tools/perf/README.md

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,6 @@ This is a comprehensive Python benchmark suite to run perf runs using different
 5. TensorRT


-Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package.
-
 ## Prerequisite

 Benchmark scripts depends on following Python packages in addition to requirements.txt packages
@@ -47,13 +45,15 @@ Here are the list of `CompileSpec` options that can be provided directly to comp
 * `--backends` : Comma separated string of backends. Eg: torch, torch_compile, dynamo, tensorrt
 * `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `dynamo` or `torch_compile`, the input should be a Pytorch module (instead of a torchscript module).
 * `--model_torch` : Name of the PyTorch model file (optional, only necessary if `dynamo` or `torch_compile` is a chosen backend)
+* `--onnx` : ONNX model file which helps bypass the step of exporting ONNX from `model_torch`. If this argument is provided, the ONNX will be directly converted to TRT engine
 * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT
 * `--batch_size` : Batch size
 * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16
 * `--device` : Device ID
 * `--truncate` : Truncate long and double weights in the network in Torch-TensorRT
 * `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine.
 * `--report` : Path of the output file where performance summary is written.
+* `--optimization_level` : Builder optimization level for TensorRT (from 1 to 5, 5 is the highest optimization).

 Eg:
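The new `--optimization_level` flag corresponds to TensorRT's builder optimization level. As a rough illustration (not part of this commit), this is how such a flag is typically applied when building an engine from an ONNX file; the `build_engine` helper and its arguments are hypothetical:

import tensorrt as trt

def build_engine(onnx_path, level=3):
    # Parse the ONNX model into a TensorRT network definition.
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        parser.parse(f.read())

    # A higher builder_optimization_level lets TensorRT search longer for
    # faster tactics, trading build time for runtime performance.
    config = builder.create_builder_config()
    config.builder_optimization_level = level
    return builder.build_serialized_network(network, config)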

tools/perf/perf_run.py

Lines changed: 4 additions & 6 deletions
@@ -444,10 +444,8 @@ def run_tensorrt(
     if params["onnx"]:
         onnx_path = params["onnx"]
     else:
-        # Export an ONNX model and convert to TRT
         onnx_path = "./onnx-trt.onnx"
-        exp_program = torch.export.export(model.eval().cuda(), tuple(input_tensors))
-        torch.onnx.export(exp_program, tuple(input_tensors), onnx_path)
+        torch.onnx.export(model, tuple(input_tensors), onnx_path, dynamo=True)
     builder = trt.Builder(logger)
     network = builder.create_network(
         1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
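The hunk above replaces the two-step torch.export.export followed by torch.onnx.export with a single call to the dynamo-based ONNX exporter. A minimal standalone sketch of the new export path (the ResNet-18 model and input shape are placeholders for illustration):

import torch
import torchvision.models as models

model = models.resnet18().eval().cuda()
example_inputs = (torch.randn(1, 3, 224, 224, device="cuda"),)

# dynamo=True selects the torch.export-based ONNX exporter, so a separate
# torch.export.export() call is no longer needed before exporting.
torch.onnx.export(model, example_inputs, "./onnx-trt.onnx", dynamo=True)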
@@ -472,6 +470,7 @@ def run_tensorrt(
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)

+    start_time = timeit.default_timer()
     # Get I/O tensor information using TensorRT 10 API
     input_names = []
     output_names = []
@@ -526,16 +525,15 @@

     # Performance measurement
     for i in range(iters):
-        start_time = timeit.default_timer()
         # Wait for current stream to finish
         dedicated_stream.wait_stream(current_stream)
         context.execute_async_v3(dedicated_stream.cuda_stream)
         # Wait for TensorRT stream to finish
         current_stream.wait_stream(dedicated_stream)
         torch.cuda.synchronize()
         end_time = timeit.default_timer()
-        meas_time = end_time - start_time
-        timings.append(meas_time)
+        infer_time = end_time - start_time
+        timings.append(infer_time)

     recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
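For reference, the measurement loop above runs TensorRT inference on a dedicated CUDA stream via execute_async_v3 and synchronizes before reading the timer. A simplified, self-contained sketch of that pattern under the commit's new timing placement (assumes `context` is a TensorRT IExecutionContext whose I/O tensor addresses are already bound):

import timeit
import torch

def measure(context, iters=20):
    # Execute on a dedicated stream, ordered after any pending work on the
    # current stream, and wait for completion before timing each iteration.
    current_stream = torch.cuda.current_stream()
    dedicated_stream = torch.cuda.Stream()
    timings = []

    start_time = timeit.default_timer()
    for _ in range(iters):
        dedicated_stream.wait_stream(current_stream)
        context.execute_async_v3(dedicated_stream.cuda_stream)
        current_stream.wait_stream(dedicated_stream)
        torch.cuda.synchronize()
        end_time = timeit.default_timer()
        # As in the change above, each entry is measured from the single
        # start point taken before the loop.
        timings.append(end_time - start_time)
    return timings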

tools/perf/requirements.txt

Lines changed: 1 addition & 2 deletions
@@ -4,6 +4,5 @@ pyyaml
 onnx
 pandas
 transformers
-diffusers==0.21.4
+diffusers
 timm==0.9.8
-