@@ -78,6 +78,10 @@ def run_torch_tensorrt(model, input_tensors, params, precision):
7878 "inputs" : input_tensors ,
7979 "enabled_precisions" : {precision_to_dtype (precision )}
8080 }
81+
82+ if precision == 'int8' :
83+ compile_settings .update ({"calib" : params .get ('calibration_cache' )})
84+
8185
8286 model = torchtrt .compile (model , ** compile_settings )
8387
@@ -166,26 +170,35 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False):
166170 k += 1
167171
168172 timings = []
169- with torch .no_grad ():
170- with engine .create_execution_context () as context :
171- for i in range (WARMUP_ITER ):
172- context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
173- torch .cuda .synchronize ()
174-
175- for i in range (iters ):
176- start_time = timeit .default_timer ()
177- context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
178- torch .cuda .synchronize ()
179- end_time = timeit .default_timer ()
180- meas_time = end_time - start_time
181- timings .append (meas_time )
182- print ("Iterations {}: {:.6f} s" .format (i , end_time - start_time ))
173+ with engine .create_execution_context () as context :
174+ for i in range (WARMUP_ITER ):
175+ context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
176+ torch .cuda .synchronize ()
177+
178+ for i in range (iters ):
179+ start_time = timeit .default_timer ()
180+ context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
181+ torch .cuda .synchronize ()
182+ end_time = timeit .default_timer ()
183+ meas_time = end_time - start_time
184+ timings .append (meas_time )
185+ print ("Iterations {}: {:.6f} s" .format (i , end_time - start_time ))
183186
184187 printStats ("TensorRT" , timings , precision )
185188
186189# Deploys inference run for different backend configurations
187190def run (model , input_tensors , params , precision , is_trt_engine = False ):
188191 for backend in params .get ('backend' ):
192+
193+ if precision == 'int8' :
194+ if backend == 'all' or backend == 'torch' :
195+ print ("int8 precision is not supported for torch runtime in this script yet" )
196+ return False
197+
198+ if (backend == 'all' or backend == 'torch_tensorrt' ) and params .get ('calibration_cache' , None ) is None :
199+ print ("int8 precision expects calibration cache file for inference" )
200+ return False
201+
189202 if backend == 'all' :
190203 run_torch (model , input_tensors , params , precision )
191204 run_torch_tensorrt (model , input_tensors , params , precision )
@@ -280,20 +293,25 @@ def load_model(params):
280293 # Create random input tensor of certain size
281294 torch .manual_seed (12345 )
282295
283- num_input = params .get ('input' ).get ('num_of_input' )
296+ num_input = params .get ('input' ).get ('num_inputs' )
284297 for precision in params .get ('runtime' ).get ('precision' , 'fp32' ):
285298 input_tensors = []
286- num_input = params .get ('input' ).get ('num_of_input' , 1 )
299+ num_input = params .get ('input' ).get ('num_inputs' , 1 )
287300 for i in range (num_input ):
288301 inp_tensor = params .get ('input' ).get ('input' + str (i ))
289302 input_tensors .append (torch .randint (0 , 2 , tuple (d for d in inp_tensor ), dtype = precision_to_dtype (precision )).cuda ())
290303
304+ if is_trt_engine :
305+ print ("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results" )
306+
291307 if not is_trt_engine and (precision == "fp16" or precision == "half" ):
292308 # If model is TensorRT serialized engine then model.half will report failure
293309 model = model .half ()
294310
295311 # Run inference
296- run (model , input_tensors , params , precision , is_trt_engine )
312+ status = run (model , input_tensors , params , precision , is_trt_engine )
313+ if status == False :
314+ continue
297315
298316 # Generate report
299317 print ('Model Summary:' )
0 commit comments