
Commit b012237

make float8 scaling type configurable (#489)
Summary:

Adds config options to configure the float8 scaling type for input, weight, and grad_output. Performance is not ideal yet, but that's because we have not optimized it.

Test Plan:

```
// repeat for input, weight, grad_output
with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --training.enable_float8_linear --training.float8_scaling_type_weight delayed --training.compile
```
1 parent f13fe3f commit b012237

3 files changed: +67, -4 lines changed

torchtitan/config_manager.py

Lines changed: 20 additions & 2 deletions
```
@@ -352,8 +352,7 @@ def __init__(self):
             "--training.enable_float8_linear",
             action="store_true",
             help="""
-                If true, swaps `torch.nn.Linear` with `Float8Linear` with
-                default settings (dynamic scaling).
+                If true, swaps `torch.nn.Linear` with `Float8Linear`.
                 This feature requires you to install 'float8_experimental' which can be found
                 here: https://github.com/pytorch-labs/float8_experimental
             """,
```
```
@@ -370,6 +369,25 @@ def __init__(self):
             default=False,
             help="Whether precompute float8 scales dynamically for FSDP",
         )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_input",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for input, dynamic (default) or delayed",
+            choices=["dynamic", "delayed"],
+        )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_weight",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for weight, dynamic (default) or delayed",
+        )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_grad_output",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for grad_output, dynamic (default) or delayed",
+        )
         self.parser.add_argument(
             "--training.gc_freq",
             type=int,
```
torchtitan/float8_linear.py

Lines changed: 42 additions & 1 deletion
```
@@ -59,9 +59,19 @@ def maybe_build_fp8_linear(
         enable_fsdp_float8_all_gather = (
             job_config.training.enable_fsdp_float8_all_gather and dp_enabled
         )
+        scaling_type_input = ScalingType(job_config.training.float8_scaling_type_input)
+        scaling_type_weight = ScalingType(
+            job_config.training.float8_scaling_type_weight
+        )
+        scaling_type_grad_output = ScalingType(
+            job_config.training.float8_scaling_type_grad_output
+        )
         float8_config = Float8LinearConfig(
             enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
-            cast_config_weight=CastConfig(scaling_type=ScalingType.DYNAMIC),
+            cast_config_input=CastConfig(scaling_type=scaling_type_input),
+            cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
+            cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output),
+            enable_pre_and_post_forward=False,
         )
         convert_to_float8_training(
             model,
```
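The `ScalingType(...)` calls above work because `ScalingType` in float8_experimental is, as far as I can tell, a string-valued enum. A minimal stand-in sketch (an assumption for illustration, not the library's actual definition):

```
from enum import Enum


class ScalingType(Enum):
    DELAYED = "delayed"
    DYNAMIC = "dynamic"


# Constructing from the config string selects the member by value,
# and an unrecognized string fails fast with a ValueError.
assert ScalingType("delayed") is ScalingType.DELAYED
assert ScalingType("dynamic") is ScalingType.DYNAMIC
```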
```
@@ -95,3 +105,34 @@ def maybe_precompute_fp8_dynamic_scale_for_fsdp(
     from float8_experimental import precompute_float8_dynamic_scale_for_fsdp

     precompute_float8_dynamic_scale_for_fsdp(model)
+
+
+_sync_float8_amax_and_scale_history = None
+
+
+def maybe_sync_float8_amax_and_scale_history(model: nn.Module, job_config: JobConfig):
+    if not (
+        job_config.training.enable_float8_linear
+        and (
+            job_config.training.float8_scaling_type_input == "delayed"
+            or job_config.training.float8_scaling_type_weight == "delayed"
+            or job_config.training.float8_scaling_type_grad_output == "delayed"
+        )
+    ):
+        return
+
+    from float8_experimental import sync_float8_amax_and_scale_history
+
+    # TODO(future): see if precalculating the modules to sync over is going to
+    # meaningfully help performance
+
+    global _sync_float8_amax_and_scale_history
+    if _sync_float8_amax_and_scale_history is None:
+        if job_config.training.compile:
+            _sync_float8_amax_and_scale_history = torch.compile(
+                sync_float8_amax_and_scale_history
+            )
+        else:
+            _sync_float8_amax_and_scale_history = sync_float8_amax_and_scale_history
+
+    _sync_float8_amax_and_scale_history(model)
```
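The new helper uses a compile-once caching pattern: the (optionally `torch.compile`d) sync function is stored in a module-level global the first time it is needed, and the cached callable is what gets invoked on every later step, so `torch.compile` is not re-triggered per step. A generic sketch of the same pattern (hypothetical names, not the torchtitan API):

```
import torch

_cached_sync_fn = None  # hypothetical module-level cache


def _get_sync_fn(fn, use_compile: bool):
    """Return fn, compiling it at most once if use_compile is set."""
    global _cached_sync_fn
    if _cached_sync_fn is None:
        _cached_sync_fn = torch.compile(fn) if use_compile else fn
    return _cached_sync_fn
```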

train.py

Lines changed: 5 additions & 1 deletion
```
@@ -30,6 +30,7 @@
 from torchtitan.float8_linear import (
     maybe_build_fp8_linear,
     maybe_precompute_fp8_dynamic_scale_for_fsdp,
+    maybe_sync_float8_amax_and_scale_history,
 )
 from torchtitan.logging_utils import init_logger, logger
 from torchtitan.lr_scheduling import get_lr_schedulers
```
```
@@ -417,12 +418,15 @@ def loss_fn(pred, labels):
                 model.parameters(), job_config.training.max_norm, foreach=True
             )

+            # if float8 is enabled, sync float8 amaxes and scales
+            maybe_sync_float8_amax_and_scale_history(model, job_config)
+
             # optimizer step
             checkpoint.wait_for_staging()
             optimizers.step()
             lr_schedulers.step()

-            # when fp8 config is on,
+            # when float8 config is on,
             # calculate float8 dynamic amax/scale for all-parameter for FSDP2
             # it issues a single all-reduce for all parameters at once for better performance
             maybe_precompute_fp8_dynamic_scale_for_fsdp(model, job_config)
```
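Putting the train.py pieces together, the per-step ordering matters: the amax/scale sync runs after gradient clipping and before the optimizer step, while the FSDP2 dynamic-scale precompute runs after the weights have been updated. A schematic wrapper over the calls used above (hypothetical, not torchtitan's actual loop, which also handles checkpoint staging and more):

```
import torch

from torchtitan.float8_linear import (
    maybe_precompute_fp8_dynamic_scale_for_fsdp,
    maybe_sync_float8_amax_and_scale_history,
)


def float8_step(model, optimizers, lr_schedulers, job_config):
    # clip gradients produced by the backward pass
    torch.nn.utils.clip_grad_norm_(
        model.parameters(), job_config.training.max_norm, foreach=True
    )

    # delayed scaling only: sync float8 amaxes and scales
    maybe_sync_float8_amax_and_scale_history(model, job_config)

    # update weights
    optimizers.step()
    lr_schedulers.step()

    # dynamic scaling + FSDP2 all-gather: precompute scales for the freshly
    # updated weights, issuing a single all-reduce for all parameters
    maybe_precompute_fp8_dynamic_scale_for_fsdp(model, job_config)
```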
