-import pytest
+"""
+
+## Performance Profiling Example
+
+A minimal version of the `gpt-fast` `generate.py` script that demonstrates usage of `torchao.profiler.TransformerPerformanceCounter`.
+- Outputs from `gpt-fast` are prefixed with `GPTFast`.
+- Outputs from `torchao.profiler.TransformerPerformanceCounter` are prefixed with `TransformerPerfCounter`.
+
+## Usage
+```bash
+python perf_profile.py --prompt "Hello my name is" --checkpoint_path path/to/model.pth --num_samples 1 --max_new_tokens 2 --save_path performance_stats.json
+```
+where `checkpoint_path` is the path to the model weights converted per the `gpt-fast` instructions and `save_path` specifies where to save the accumulated performance stats.
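+
+Internally, the script wraps the `prefill` and per-token `decode` steps with the counter. The sketch below shows the
+general pattern only (a simplified, hypothetical usage: the constructor arguments and the `count` / `get_summary` /
+`to_json` names are assumptions inferred from the output below, and `prefill` / `decode_one_token` stand in for the
+script's own helpers):
+
+```python
+import torch
+from torchao.profiler import CUDADeviceSpec, TransformerPerformanceCounter
+
+# Device peaks (bandwidth/FLOPs) drive the utilization and theoretical-latency
+# numbers in the report; the constructor arguments here are hypothetical.
+device_spec = CUDADeviceSpec(dtype=torch.bfloat16)
+perf_counter = TransformerPerformanceCounter(device_spec=device_spec)
+
+# Count IO and FLOPs over the whole prompt during prefill ...
+with perf_counter.count("prefill", num_tokens=prompt.size(-1)):
+    next_token = prefill(model, prompt, input_pos)
+
+# ... and one token at a time during decode.
+with perf_counter.count("decode", num_tokens=1):
+    next_token = decode_one_token(model, next_token, input_pos)
+
+print(perf_counter.get_summary())               # stats accumulated across all counted steps
+perf_counter.to_json("performance_stats.json")  # dump accumulated stats (hypothetical helper name)
+```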
+
+
+Running the above command for `llama2-7b` should print the following, with accumulated stats saved to `performance_stats.json`:
+
+```
+Loading model ...
+Time to load model: 20.14 seconds
+
+==============================
+
+Using DeviceSpec(device_type=cuda, name=NVIDIA GeForce RTX 3090, dtype=torch.bfloat16, bandwidth=936.1GB/s, flops=35.6TFLOPs, vram=25.4GB)
+Model Config: ModelArgs(block_size=2048, vocab_size=32000, n_layer=32, n_head=32, dim=4096, intermediate_size=11008, n_local_heads=32, head_dim=128, rope_base=10000, norm_eps=1e-05)
+Active params, Total Params: 6607343616, 6738415616
+
+==============================
+
+TransformerPerfCounter Metrics
+PREFILL_SEQLEN-6:
+  Latency = 1.26 s
+  Tokens
+    Total: 6 tokens
+    Throughput: 5 tokens/s
+  IO
+    Total: 13.25 GB
+    Throughput: 10.54 GB/s
+    Theoretical Latency: 14.15 ms
+  FLOPs
+    Total: 79.31 GFLOPs
+    Throughput: 63.06 GFLOPs/s
+    Theoretical Latency: 2.23 ms
+  Utilization
+    Bandwidth: 0.0113 %
+    FLOPs: 0.0018 %
+
+==============================
+
+TransformerPerfCounter Metrics
+DECODE_CTX-6_NUM_TOKS-1:
+  Latency = 0.16 s
+  Tokens
+    Total: 1 tokens
+    Throughput: 6 tokens/s
+  IO
+    Total: 13.22 GB
+    Throughput: 83.27 GB/s
+    Theoretical Latency: 14.13 ms
+  FLOPs
+    Total: 13.22 GFLOPs
+    Throughput: 83.24 GFLOPs/s
+    Theoretical Latency: 0.37 ms
+  Utilization
+    Bandwidth: 0.0890 %
+    FLOPs: 0.0023 %
+
+==============================
+
+Generated text for sample 0: Hello, my name is [Name
+
+GPTFast Sample Metrics
+  Time for inference 1: 6 prompt tokens 2 tokens generated, 1.57 sec total, 1.28 tokens/sec
+  Bandwidth achieved: 17.22 GB/s
+
+==============================
+
+GPTFast Aggregate Stats
+  Average tokens/sec: 1.28
+  Memory used: 13.51 GB
+
+==============================
+
+TransformerPerfCounter
+Performance Summary:
+  Latency = 1.42 s
+  Tokens
+    Total: 7 tokens
+    Throughput: 5 tokens/s
+  IO
+    Total: 26.47 GB
+    Throughput: 18.69 GB/s
+    Theoretical Latency: 28.28 ms
+  FLOPs
+    Total: 92.53 GFLOPs
+    Throughput: 65.33 GFLOPs/s
+    Theoretical Latency: 2.60 ms
+  Utilization
+    Bandwidth: 0.0200 %
+    FLOPs: 0.0018 %
+
+Saving performance results to performance_stats.json
+```
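+
+The `Theoretical Latency` and `Utilization` entries can be read against the `DeviceSpec` peaks printed at the top of
+the run (this is an interpretation of the reported numbers, not a description of the implementation). For the prefill
+stage above: 13.25 GB / 936.1 GB/s ≈ 14.15 ms and 79.31 GFLOPs / 35.6 TFLOPs ≈ 2.23 ms give the theoretical latencies,
+while the achieved throughputs over the peaks, 10.54 / 936.1 ≈ 0.0113 and 63.06 G / 35.6 T ≈ 0.0018, give the
+utilization entries.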
+
+**Notes**
+- The discrepancy between the `gpt-fast` token throughput and that of `TransformerPerformanceCounter` arises because `gpt-fast` only counts generated tokens (no prefill):
+even though the `prefill` phase technically generates `len(prompt) + 1` tokens, `gpt-fast` counts the tokens generated during this phase as `1`,
+whereas `TransformerPerformanceCounter` includes all `prefill` tokens in its total token count (see the worked numbers below).
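+- For the run above (6 prompt tokens, `--max_new_tokens 2`) this works out to: `gpt-fast` reports
+2 generated tokens / 1.57 s ≈ 1.28 tokens/sec, while `TransformerPerfCounter` reports
+6 prefill tokens + 1 decode token = 7 tokens / 1.42 s ≈ 5 tokens/s.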
| 109 | +""" |

-# Skip if transformers is not installed
-transformers = pytest.importorskip("transformers")
-LlamaConfig = transformers.models.llama.modeling_llama.LlamaConfig
-LlamaForCausalLM = transformers.models.llama.modeling_llama.LlamaForCausalLM
-# import sys
import textwrap
import time
from pathlib import Path
from typing import Optional, Tuple, Union

import torch
-from model import Transformer
-from tokenizer import get_tokenizer
from torch.nn.attention import SDPBackend

+from torchao._models.llama.model import Transformer
+from torchao._models.llama.tokenizer import get_tokenizer
from torchao.profiler import (
    CUDADeviceSpec,
    TransformerPerformanceCounter,