Skip to content

Commit 0b66ff0

Browse files
authored
[FEAT] Perf Profiler Update (#690)
* add device_spec * add performance counter * add more perf counter tools * add performance counter manager test * add mbu and mfu test * refactor performance manager device spec * add perf stats * start perf counter manager test refactor * add stat print str * refactor performance counter with perf stats * more perf stats tests * add perf stat print formatting tests * fix device spec formatting * finish perf counter manager refactor * add serialization test * refactor stats tests * refactor remaining tests * clean up tests * clean up device_spec tests * add latency * add latency tests * fix formatting * remove unused methods * add documentation * more docs * formatting * clean up warnings * rename duration -> latency * add gpt-fast example * linting and formatting * update profiler tutorial readme * move total_model_params to utils * remove tutorials/profiler
1 parent ed4c405 commit 0b66ff0

File tree

8 files changed

+2230
-0
lines changed

8 files changed

+2230
-0
lines changed

test/profiler/test_device_spec.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pytest
2+
3+
cuda_driver = pytest.importorskip(
4+
"triton.runtime.driver", reason="requires triton cuda driver module"
5+
)
6+
import itertools
7+
8+
import torch
9+
from utils import patch_device
10+
11+
from torchao.profiler.device_spec import (
12+
_AVAILABLE_GPU_SPECS,
13+
CUDADeviceSpec,
14+
get_chip_name,
15+
)
16+
17+
# -------------------- Device Spec Tests ------------------- #
# Device identifiers under test; each is resolved to a chip key via
# get_chip_name() inside the tests below.
DEVICE_NAMES = ["h100 sxm", "a100", "nvidia geforce rtx 4090"]
# Dtypes whose peak-FLOPs entries are looked up in _AVAILABLE_GPU_SPECS.
DTYPES = [torch.float32, torch.bfloat16, torch.float16]
USE_TENSORCORES = [True, False]
# Full cross-product of (device, dtype, tensorcores) configurations.
# NOTE: itertools.product yields a one-shot iterator — it is consumed by the
# single @pytest.mark.parametrize below and must not be reused elsewhere.
DEVICE_CONFIGS = itertools.product(DEVICE_NAMES, DTYPES, USE_TENSORCORES)
22+
23+
24+
@pytest.mark.parametrize(
    "device_name, dtype, use_tensorcores", DEVICE_CONFIGS, ids=lambda x: str(x)
)
def test_device_spec(device_name, dtype, use_tensorcores):
    """Check that a recognized GPU's spec fields match the published lookup table."""
    with patch_device(device_name):
        spec = CUDADeviceSpec(dtype=dtype, use_tensorcores=use_tensorcores)

        # The spec table keys fp32-with-tensorcores under "tfloat32", so the
        # lookup key diverges from the requested dtype in that one case.
        key = "tfloat32" if (dtype == torch.float32 and use_tensorcores) else dtype
        expected_flops = _AVAILABLE_GPU_SPECS[get_chip_name(device_name)][key]

        assert spec.flops_per_s == expected_flops
        assert spec.flops_by_dtype[key] == expected_flops
        assert spec.roofline_balancepoint == expected_flops / spec.bandwidth

        # Once populated, flops_per_s may not be reset to None.
        with pytest.raises(AssertionError):
            spec.flops_per_s = None
            print(spec.roofline_balancepoint)

        # Attributes outside the declared named fields are rejected, guarding
        # against user typos.
        with pytest.raises(AttributeError):
            spec.FLOPs = None
46+
47+
48+
def test_empty_device_spec():
    """CUDADeviceSpec must refuse construction when required fields can't be resolved."""
    # Unrecognized device: nothing can be auto-filled, so bare construction
    # trips an assertion...
    unknown_device = "fake device"
    with patch_device(unknown_device):
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()

        # ...but succeeds when every field is supplied explicitly.
        _ = CUDADeviceSpec(
            name=unknown_device,
            flops_per_s=1.0,
            bandwidth=1.0,
            dtype=torch.float32,
            use_tensorcores=True,
        )

    # Recognized device: all critical fields auto-fill except dtype (vram is
    # also unfilled, but vram is not used for downstream calcs at the moment).
    known_device = DEVICE_NAMES[0]
    with patch_device(known_device):
        _ = CUDADeviceSpec(dtype=torch.float32)

        # Omitting dtype is still an error even on a recognized device.
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()

0 commit comments

Comments
 (0)