Skip to content

[Low-bit optim] Improve compile time + Fix PyTorch 2.3 support for 4-bit optim #812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 4 additions & 11 deletions test/prototype/test_low_bit_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_quantize_4bit_with_qmap_compile(self, device):


class TestOptim(TestCase):
@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_4, reason="requires PyTorch >= 2.3")
@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_3, reason="requires PyTorch >= 2.3")
@parametrize("optim_name", ["Adam8bit", "AdamW8bit", "Adam4bit", "AdamW4bit", "AdamFp8", "AdamWFp8"])
@parametrize("dtype", [torch.float32, torch.bfloat16])
@parametrize("device", _DEVICES)
Expand All @@ -84,10 +84,7 @@ def test_optim_smoke(self, optim_name, dtype, device):
if not TORCH_VERSION_AT_LEAST_2_4:
pytest.skip("FP8 CUDA requires PyTorch >= 2.4")
if torch.cuda.get_device_capability() < (8, 9):
pytest.skip("FP8 requires compute capability >= 8.9")

# reset cache to avoid hitting cache_size_limit, since the function will re-compile for each test
torch._dynamo.reset_code_caches()
pytest.skip("FP8 CUDA requires compute capability >= 8.9")

model = nn.Sequential(nn.Linear(32, 256), nn.ReLU(), nn.Linear(256, 32))
model.to(device=device, dtype=dtype)
Expand Down Expand Up @@ -232,12 +229,11 @@ def world_size(self) -> int:
return 2

@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="OptimState8bit dispatch: attempting to run unimplemented operator/function: aten.as_strided.default")
@pytest.mark.skipif(TORCH_VERSION_AT_LEAST_2_5, reason="https://github.com/pytorch/ao/issues/652")
@skip_if_lt_x_gpu(2)
def test_fsdp2(self):
optim_classes = [low_bit_optim.Adam8bit, low_bit_optim.Adam4bit]
optim_classes = [low_bit_optim.AdamW8bit, low_bit_optim.AdamW4bit]
if torch.cuda.get_device_capability() >= (8, 9):
optim_classes.append(low_bit_optim.AdamFp8)
optim_classes.append(low_bit_optim.AdamWFp8)

self.run_subtests(
{"optim_cls": optim_classes},
Expand All @@ -252,9 +248,6 @@ def _test_fsdp2(self, optim_cls):
TransformerBlock,
)

# seems like cache_size_limit is shared between FSDP processes?
torch._dynamo.config.cache_size_limit = 8 * self.world_size

batch_size = 3
vocab_size = 1024
seq_len = 64
Expand Down
46 changes: 18 additions & 28 deletions torchao/prototype/low_bit_optim/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,45 +27,35 @@ NOTE:
- The low-bit optimizers require PyTorch >= 2.3
- For FP8 optimizers on CUDA, PyTorch >= 2.4 and CUDA compute capability >= 8.9 are required.
- For 4-bit optimizers, we don't implement rank-1 normalization for quantizing 2nd moment as originally done in the paper.
- The first training step is expected to be slow since the optimizer needs to be compiled.

## Benchmarks

Fine-tune [timm](https://github.com/huggingface/pytorch-image-models)'s ViT-H (630M params) on [resisc45](https://huggingface.co/datasets/timm/resisc45) dataset. BF16 AMP, 1 epoch, batch size 8, cosine LR scheduler, 4070Ti SUPER, fixed random seed. Benchmark script is available at [benchmarks/benchmark_low_bit_adam.py](../../../benchmarks/benchmark_low_bit_adam.py).
Fine-tune [timm](https://github.com/huggingface/pytorch-image-models)'s [ViT-H](https://huggingface.co/timm/vit_huge_patch14_224.orig_in21k) (630M params) on [resisc45](https://huggingface.co/datasets/timm/resisc45) dataset. PyTorch 2.4, BF16 AMP, compiled model, 1 epoch, batch size 8, cosine LR scheduler, 4070Ti SUPER, fixed random seed. Benchmark script is available at [benchmarks/benchmark_low_bit_adam.py](../../../benchmarks/benchmark_low_bit_adam.py).

AdamW impl | Max memory (GB) | imgs/s | accuracy
----------------|-----------------|--------|----------
PyTorch (fused) | 12.23 | 41.8 | 94.38
bnb 8-bit | 8.32 | 43.6 | 94.18
ao 8-bit | 8.33 | 42.6 | 94.25
ao FP8 E4M3 | 9.27 | 44.1 | 94.40
lpmm 4-bit | 7.72 | 46.0 | 94.29
ao 4-bit | 7.72 | 40.0 | 94.03
lpmm 4-bit (*) | 7.74 | 26.6 | 94.25
AdamW impl | Peak memory allocated (GB) | imgs/s | accuracy
----------------|----------------------------|--------|----------
PyTorch (fused) | 12.23 | 41.9 | 94.52
bnb 8-bit | 8.32 | 43.6 | 94.54
ao 8-bit | 8.33 | 42.5 | 94.30
ao FP8 E4M3 | 8.33 | 43.2 | 94.13
lpmm 4-bit | 7.72 | 46.1 | 94.40
ao 4-bit | 7.72 | 42.4 | 94.13
lpmm 4-bit (*) | 7.74 | 26.7 | 94.10

(*) means rank-1 normalization is used for 2nd optimizer state. Refer to [paper](https://arxiv.org/abs/2309.01507) for more details.

Fine-tune [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b) on [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset. Full BF16, 1 epoch, A100, fixed random seed. Benchmark is done with [torchtune](https://github.com/pytorch/torchtune). See [#746](https://github.com/pytorch/ao/pull/746) for more details.
Fine-tune [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b) on [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset. PyTorch 2.4, full BF16, 1 epoch, A100, fixed random seed. Benchmark is done with [torchtune 52d1b838](https://github.com/pytorch/torchtune/tree/52d1b838c1c35b5e75fddf8776be400adc36dff5). See [#812](https://github.com/pytorch/ao/pull/812) for more details.

AdamW impl | Max memory (GB) | toks/s | `truthfulqa_mc2` acc | Compile time
-----------------|-----------------|--------|----------------------|-------------
Not fine-tuned | - | - | 38.95 | -
PyTorch (fused) | 52 | ~4500 | 42.12 | ~4 min
bnb 8-bit | 39 | ~4000 | 41.98 | ~4 min
ao 8-bit | 39 | ~4000 | 42.41 | ~12 min
ao 4-bit | 33 | ~3600 | 42.34 | ~4 min
AdamW impl | Peak memory allocated (GB) | toks/s | `truthfulqa_mc2` acc
-----------------|----------------------------|--------|----------------------
Not fine-tuned | - | - | 38.95
PyTorch (fused) | 51.6 | 3200 | 42.61
bnb 8-bit | 39.3 | 3000 | 42.75
ao 8-bit | 39.1 | 2900 | 41.50
ao 4-bit | 33.2 | 2900 | 42.27

NOTE: lpmm's 4-bit AdamW does not support BF16 weights.

### Note on compile times

There are 2 approaches to compile optimizer step in low-bit optim:

1. Compile optim step for single param i.e. `torch.compile(single_param_adam)`
2. Compile optim step for all params i.e. `torch.compile(param_groups_adam)`

Currently Adam8bit and AdamFp8 use approach (2) (with static shape) since it is faster (but compile much slower), while Adam4bit uses approach (1) (with dynamic shape) since there are excessive memory usage for "Adam4bit + approach (2)". Approach (1) requires dynamic shape to avoid hitting recompiles limit.

## Optimizer CPU offload

This folder also implements optimizer CPU offload (i.e. ZeRO-Offload) for single GPU training. For multi-GPU training, you can use FSDP's built-in CPU offload.
Expand Down
198 changes: 47 additions & 151 deletions torchao/prototype/low_bit_optim/adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,75 +52,63 @@ def _new_buffer(self, p: Tensor, signed: bool):
out = torch.zeros_like(p)
return out

def _prepare_param_groups(self):
param_groups = []

for group in self.param_groups:
_group = []

for p in group["params"]:
if p.grad is None:
continue

grad = p.grad
if grad.is_sparse:
raise RuntimeError("Sparse gradient is not supported")

state = self.state[p]

# State initialization
if len(state) == 0:
state["step"] = torch.tensor(0.0)
state["exp_avg"] = self._new_buffer(p, True)
state["exp_avg_sq"] = self._new_buffer(p, False)
if group["amsgrad"]:
state["max_exp_avg_sq"] = self._new_buffer(p, False)

state["step"] += 1

if not isinstance(group["lr"], Tensor):
raise RuntimeError(
"lr was changed to a non-Tensor object. If you want to update lr, please use "
"optim.param_groups[0]['lr'].fill_(new_lr)"
)

p_grad_state = (
p,
grad,
state["step"],
state["exp_avg"],
state["exp_avg_sq"],
state.get("max_exp_avg_sq", None),
)
_group.append(p_grad_state)

param_groups.append((_group, group["lr"], group["betas"], group["weight_decay"], group["eps"]))

return param_groups

@torch.no_grad()
def step(self, closure=None):
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()

param_groups = self._prepare_param_groups()
# for a given model, the number of different argument combinations to single_param_adam() is fixed.
# thus, it is safe to disable cache limit without the risk of always re-compiling.
with torch._dynamo.utils.disable_cache_limit():
for group in self.param_groups:
for p in group["params"]:
if p.grad is None:
continue

grad = p.grad
if grad.is_sparse:
raise RuntimeError("Sparse gradient is not supported")

state = self.state[p]

# State initialization
if len(state) == 0:
state["step"] = torch.tensor(0.0)
state["exp_avg"] = self._new_buffer(p, True)
state["exp_avg_sq"] = self._new_buffer(p, False)
if group["amsgrad"]:
state["max_exp_avg_sq"] = self._new_buffer(p, False)

state["step"] += 1

if not isinstance(group["lr"], Tensor):
raise RuntimeError(
"lr was changed to a non-Tensor object. If you want to update lr, please use "
"optim.param_groups[0]['lr'].fill_(new_lr)"
)

torch.compile(single_param_adam, fullgraph=True, dynamic=False)(
p,
grad,
state["step"],
state["exp_avg"],
state["exp_avg_sq"],
state.get("max_exp_avg_sq", None),
group["lr"],
group["betas"][0],
group["betas"][1],
group["weight_decay"],
group["eps"],
self.is_adamw,
)

# static compile optim step for all params in a single graph
torch.compile(param_groups_adam, fullgraph=True)(param_groups, self.is_adamw)
return loss


def param_groups_adam(param_groups, is_adamw):
for group, lr, (beta1, beta2), weight_decay, eps in param_groups:
for p, grad, step, exp_avg, exp_avg_sq, max_exp_avg_sq in group:
single_param_adam(
p, grad, step, exp_avg, exp_avg_sq, max_exp_avg_sq, lr, beta1, beta2, weight_decay, eps, is_adamw
)


# this will work with any optim state tensor subclass that implements aten.lerp.Scalar and aten.copy_.default
# and param tensor subclass that implements aten.add_.Tensor, and aten.addcdiv_.default
def single_param_adam(
p: Tensor,
grad: Tensor,
Expand Down Expand Up @@ -198,53 +186,7 @@ def __init__(

@staticmethod
def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
return OptimState4bit.zeros(p.view(-1).shape, signed, block_size, p.device)

@staticmethod
def _unwrap_dtensor(p: Tensor):
return p._local_tensor if isinstance(p, DTensor) else p

@torch.no_grad()
def step(self, closure=None):
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()

param_groups = self._prepare_param_groups()

# NOTE: right now, torch.compile(param_groups_adam) will have excessive memory usage for 4-bit optim.
# thus, as a workaround, we use torch.compile(single_param_adam) and call it for each param.

# NOTE: we have to create flattened optimizer states since torch.compile() will fail otherwise for
# PyTorch 2.3 and 2.4
# calling exp_avg.view(-1) will fail torch.compile(single_param_adam) even if we implement the op
# correctly for the tensor subclass.

# unwrap DTensor since DTensor does not work well with dynamic compile
# flatten p, grad, and optim state to avoid recompilation
for group, lr, (beta1, beta2), weight_decay, eps in param_groups:
for p, grad, step, exp_avg, exp_avg_sq, max_exp_avg_sq in group:
# DTensor._local_tensor has .requires_grad = False
# to avoid recompilation, set p.requires_grad = False and restore it after optim step
p.requires_grad_(False)
torch.compile(single_param_adam, fullgraph=True, dynamic=True)(
self._unwrap_dtensor(p).view(-1),
self._unwrap_dtensor(grad).view(-1),
step,
self._unwrap_dtensor(exp_avg),
self._unwrap_dtensor(exp_avg_sq),
self._unwrap_dtensor(max_exp_avg_sq) if max_exp_avg_sq is not None else None,
lr,
beta1,
beta2,
weight_decay,
eps,
self.is_adamw,
)
p.requires_grad_(True)

return loss
return OptimState4bit.zeros(p.shape, signed, block_size, p.device)


class AdamFp8(_AdamBase):
Expand Down Expand Up @@ -301,53 +243,7 @@ def __init__(

@staticmethod
def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
return OptimState4bit.zeros(p.view(-1).shape, signed, block_size, p.device)

@staticmethod
def _unwrap_dtensor(p: Tensor):
return p._local_tensor if isinstance(p, DTensor) else p

@torch.no_grad()
def step(self, closure=None):
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()

param_groups = self._prepare_param_groups()

# NOTE: right now, torch.compile(param_groups_adam) will have excessive memory usage for 4-bit optim.
# thus, as a workaround, we use torch.compile(single_param_adam) and call it for each param.

# NOTE: we have to create flattened optimizer states since torch.compile() will fail otherwise for
# PyTorch 2.3 and 2.4
# calling exp_avg.view(-1) will fail torch.compile(single_param_adam) even if we implement the op
# correctly for the tensor subclass.

# unwrap DTensor since DTensor does not work well with dynamic compile
# flatten p, grad, and optim state to avoid recompilation
for group, lr, (beta1, beta2), weight_decay, eps in param_groups:
for p, grad, step, exp_avg, exp_avg_sq, max_exp_avg_sq in group:
# DTensor._local_tensor has .requires_grad = False
# to avoid recompilation, set p.requires_grad = False and restore it after optim step
p.requires_grad_(False)
torch.compile(single_param_adam, fullgraph=True, dynamic=True)(
self._unwrap_dtensor(p).view(-1),
self._unwrap_dtensor(grad).view(-1),
step,
self._unwrap_dtensor(exp_avg),
self._unwrap_dtensor(exp_avg_sq),
self._unwrap_dtensor(max_exp_avg_sq) if max_exp_avg_sq is not None else None,
lr,
beta1,
beta2,
weight_decay,
eps,
self.is_adamw,
)
p.requires_grad_(True)

return loss
return OptimState4bit.zeros(p.shape, signed, block_size, p.device)


class AdamWFp8(_AdamBase):
Expand Down
Loading