diff --git a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb index 5ef957fa36..8e480903ab 100644 --- a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb +++ b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb @@ -10,14 +10,14 @@ "bento/extensions/theme/main.css": true }, "kernelspec": { - "display_name": "accelerators", + "display_name": "dper3_pytorch (cinder)", "language": "python", - "name": "bento_kernel_accelerators", + "name": "bento_kernel_dper3_pytorch_cinder", "metadata": { - "kernel_name": "bento_kernel_accelerators", - "nightly_builds": true, + "kernel_name": "bento_kernel_dper3_pytorch_cinder", + "nightly_builds": false, "fbpkg_supported": true, - "cinder_runtime": false, + "cinder_runtime": true, "is_prebuilt": true } }, @@ -32,10 +32,10 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3" }, - "last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00", - "last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca", - "last_base_url": "https://devgpu005.ftw6.facebook.com:8093/", - "last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407", + "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", + "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", + "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", + "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", "outputWidgetContext": {} }, "nbformat": 4, @@ -58,14 +58,14 @@ { "cell_type": "code", "metadata": { - "originalKey": "7909785f-b9b4-41dd-82af-c144b879df39", + "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", "showInput": true, "customInput": null, "collapsed": false, - "requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", + "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", "customOutput": null, - "executionStartTime": 1656395936225, - "executionStopTime": 1656395937851 + "executionStartTime": 1661189891682, + "executionStopTime": 1661189891856 }, "source": [ "import typing as t\n", @@ -74,10 +74,10 @@ "\n", "import torch\n", "import torchvision\n", - "from torch_tensorrt.fx.lower import lower_to_trt\n", + "from torch_tensorrt.fx.lower import compile\n", "from torch_tensorrt.fx.utils import LowerPrecision" ], - "execution_count": 4, + "execution_count": 9, "outputs": [] }, { @@ -98,16 +98,16 @@ { "cell_type": "code", "metadata": { - "originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd", + "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", "showInput": true, "customInput": null, "code_folding": [], "hidden_ranges": [], "collapsed": false, - "requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726", + "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", "customOutput": null, - "executionStartTime": 1656398717455, - "executionStopTime": 1656398717662 + "executionStartTime": 1661189260550, + "executionStopTime": 1661189262039 }, "source": [ "@dataclass\n", @@ -159,24 +159,39 @@ " f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n", " )" ], - "execution_count": 22, - "outputs": [] + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" + ] + } + ] }, { "cell_type": "markdown", "metadata": { "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b", - "showInput": true, + "showInput": false, "customInput": null, "code_folding": [], "hidden_ranges": [] }, "source": [ "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the actual function that computes the fixed number of iterations of functions runs.\n", - "The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n", + "The FX path lowering and TensorRT engine creation is integrated into `compile()` API which is defined in `fx/lower.py` file.\n", "It is good to list it out and show the usage of it. It takes in original module, input and lowering setting, run lowering workflow to turn module into a executable TRT engine \n", "```\n", - "def lower_to_trt(\n", + "def compile(\n", " module: nn.Module,\n", " input: ,\n", " max_batch_size: int = 2048,\n", @@ -212,22 +227,18 @@ { "cell_type": "code", "metadata": { - "originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e", + "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b", "showInput": true, "customInput": null, "code_folding": [], "hidden_ranges": [], "collapsed": false, - "requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b", + "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0", "customOutput": null, - "executionStartTime": 1656397903207, - "executionStopTime": 1656397964752 + "executionStartTime": 1661189697773, + "executionStopTime": 1661189753875 }, "source": [ - "test_model = torchvision.models.resnet18(pretrained=True)\n", - "input = [torch.rand(128, 3, 224, 224)] \n", - "benchmark(test_model, input, 50, 128)\n", - "\n", "def benchmark_torch_function(iters: int, f, *args) -> float:\n", " \"\"\"Estimates the average time duration for a single inference call in second\n", "\n", @@ -266,7 +277,7 @@ " time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n", " elif not conf.jit:\n", " # Run lowering eager mode benchmark\n", - " lowered_module = lower_to_trt(\n", + " lowered_module = compile(\n", " module,\n", " input,\n", " max_batch_size=conf.batch_size,\n", @@ -279,6 +290,7 @@ " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", " return result\n", "\n", + "\n", "@torch.inference_mode()\n", "def benchmark(\n", " model,\n", @@ -315,16 +327,25 @@ " ),\n", " ]\n", "\n", - " results = [\n", - " run_configuration_benchmark(deepcopy(model), inputs, conf_)\n", - " for conf_ in configurations\n", - " ]\n", + " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", "\n", " for res in results:\n", - " print(res.format())" + " print(res.format())\n", + "\n", + "\n", + "test_model = torchvision.models.resnet18(pretrained=True)\n", + "input = [torch.rand(128, 3, 224, 224)]\n", + "benchmark(test_model, input, 50, 128)" ], - "execution_count": 21, + "execution_count": 8, "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" + ] + }, { "output_type": "stream", "name": "stdout", @@ -339,25 +360,60 @@ "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" ] }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103501.297 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpe_7p37fq\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103501.390 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpg_a347f0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] + }, { "output_type": "stream", "name": "stdout", "text": [ - "== Log pass before/after graph to /tmp/tmpaayayg72\n== Log pass before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n" + "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n" + "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" ] }, { @@ -374,25 +430,60 @@ "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" ] }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103523.067 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpgphlicna\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103523.106 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpy9cumddi\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] + }, { "output_type": "stream", "name": "stdout", "text": [ - "== Log pass before/after graph to /tmp/tmpnoeblgd5\n== Log pass before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n" + "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ - "I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n" + "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" ] }, { @@ -406,7 +497,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n" + "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" ] } ]