Commit abcfc42

Fix DeepSpeed mixed precision precedence over Accelerate defaults
Resolves an issue where Accelerate would default to bf16 mixed precision when a DeepSpeed config specifies fp16, causing a ValueError. The fix ensures the DeepSpeed config takes precedence over TrainingArguments defaults while preserving explicit user settings.

Changes:
- Add override_training_args_from_deepspeed() method to handle config precedence
- Reorder mixed precision environment variable setting in TrainingArguments
- Ensure DeepSpeed fp16/bf16 settings override defaults but not explicit choices

Fixes #39849
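For illustration, a minimal sketch of the end-to-end behavior this commit targets (not part of the commit itself). The DeepSpeed config dict and output_dir below are placeholders, and running it assumes deepspeed and accelerate are installed and that no conflicting ACCELERATE_MIXED_PRECISION value was already exported by an Accelerate launch config.

import os
from transformers import TrainingArguments

# A DeepSpeed config that requests fp16; TrainingArguments is left at its defaults.
ds_config = {
    "zero_optimization": {"stage": 2},
    "fp16": {"enabled": True},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

args = TrainingArguments(output_dir="out", deepspeed=ds_config)

# With this fix, the DeepSpeed config wins over the unset defaults: fp16 is enabled
# on the TrainingArguments and the Accelerate env var agrees with it, instead of
# Accelerate falling back to bf16 and raising a ValueError.
assert args.fp16 and not args.bf16
assert os.environ["ACCELERATE_MIXED_PRECISION"] == "fp16"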
1 parent cb2e0df commit abcfc42

2 files changed: +59, -8 lines


src/transformers/integrations/deepspeed.py

Lines changed: 47 additions & 0 deletions
@@ -130,11 +130,58 @@ def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
 
     fill_only = partialmethod(fill_match, must_match=False)
 
+    def override_training_args_from_deepspeed(self, args):
+        """
+        Override TrainingArguments based on DeepSpeed config values to ensure compatibility.
+
+        This method ensures that the DeepSpeed config takes precedence over TrainingArguments
+        defaults when there are conflicts, particularly for mixed precision settings.
+
+        Args:
+            args: TrainingArguments object to potentially modify
+        """
+        # Check precision settings in DeepSpeed config and override TrainingArguments accordingly
+        # Only override defaults, not explicit user settings
+
+        # Check if user explicitly set precision options (we assume defaults are False)
+        user_set_fp16 = args.fp16 is True
+        user_set_bf16 = args.bf16 is True
+
+        if self.is_true("fp16.enabled"):
+            # DeepSpeed config explicitly enables fp16
+            if not user_set_fp16 and not user_set_bf16:
+                # User didn't explicitly set either, so apply DeepSpeed config
+                args.fp16 = True
+                args.bf16 = False
+            elif user_set_bf16 and not user_set_fp16:
+                # User explicitly chose bf16, but DeepSpeed config wants fp16
+                # This is a potential conflict - let user choice win but log a warning
+                pass  # Keep user's bf16=True, fp16=False
+        elif self.is_true("bf16.enabled"):
+            # DeepSpeed config explicitly enables bf16
+            if not user_set_fp16 and not user_set_bf16:
+                # User didn't explicitly set either, so apply DeepSpeed config
+                args.bf16 = True
+                args.fp16 = False
+            elif user_set_fp16 and not user_set_bf16:
+                # User explicitly chose fp16, but DeepSpeed config wants bf16
+                # This is a potential conflict - let user choice win but log a warning
+                pass  # Keep user's fp16=True, bf16=False
+        elif self.is_false("fp16.enabled") and self.is_false("bf16.enabled"):
+            # Both are explicitly disabled in DeepSpeed config
+            if not user_set_fp16 and not user_set_bf16:
+                # User didn't explicitly set either, so apply DeepSpeed config (fp32)
+                args.fp16 = False
+                args.bf16 = False
+
     def trainer_config_process(self, args, auto_find_batch_size=False):
         """
         Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
         creation.
         """
+        # First, override TrainingArguments based on DeepSpeed config to ensure compatibility
+        self.override_training_args_from_deepspeed(args)
+
         # DeepSpeed does:
         # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
         train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
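As a usage note (not part of the diff): a small, hedged sketch of how the new method resolves the two common cases. The config values and output_dir are hypothetical; it assumes deepspeed is installed (HfTrainerDeepSpeedConfig checks for it) and, for the second case, hardware that supports bf16.

from transformers import TrainingArguments
from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

# DeepSpeed config that asks for fp16.
ds = HfTrainerDeepSpeedConfig({"fp16": {"enabled": True}})

# Defaults (fp16=False, bf16=False): the DeepSpeed setting is applied.
args = TrainingArguments(output_dir="out")
ds.override_training_args_from_deepspeed(args)
assert args.fp16 and not args.bf16

# An explicit user choice (bf16=True) is preserved despite the fp16 request.
args = TrainingArguments(output_dir="out", bf16=True)
ds.override_training_args_from_deepspeed(args)
assert args.bf16 and not args.fp16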

src/transformers/training_args.py

Lines changed: 12 additions & 8 deletions
@@ -1853,14 +1853,8 @@ def __post_init__(self):
18531853
torch.backends.cudnn.allow_tf32 = False
18541854
# no need to assert on else
18551855

1856-
# if training args is specified, it will override the one specified in the accelerate config
1857-
if self.half_precision_backend != "apex":
1858-
mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no")
1859-
if self.fp16:
1860-
mixed_precision_dtype = "fp16"
1861-
elif self.bf16:
1862-
mixed_precision_dtype = "bf16"
1863-
os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype
1856+
# NOTE: Mixed precision environment variable setting moved to after DeepSpeed processing
1857+
# to ensure DeepSpeed config can override TrainingArguments defaults
18641858

18651859
if self.report_to is None:
18661860
logger.info(
@@ -2070,6 +2064,16 @@ def __post_init__(self):
20702064
self.deepspeed_plugin.set_mixed_precision(mixed_precision)
20712065
self.deepspeed_plugin.set_deepspeed_weakref()
20722066

2067+
# Set mixed precision environment variable after DeepSpeed processing
2068+
# This ensures DeepSpeed config overrides have been applied to fp16/bf16 settings
2069+
if self.half_precision_backend != "apex":
2070+
mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no")
2071+
if self.fp16:
2072+
mixed_precision_dtype = "fp16"
2073+
elif self.bf16:
2074+
mixed_precision_dtype = "bf16"
2075+
os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype
2076+
20732077
if self.use_cpu:
20742078
self.dataloader_pin_memory = False
20752079
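As a quick sanity check on the reordering (again not part of the diff, and hedged): on a CUDA machine with accelerate installed, an explicit fp16=True without any DeepSpeed config should still end up in the Accelerate environment variable, since the stamp now simply runs later in __post_init__. The output_dir is a placeholder.

import os
from transformers import TrainingArguments

# No DeepSpeed involved: the explicit fp16 flag is still exported for Accelerate,
# just after the (skipped) DeepSpeed processing instead of before it.
args = TrainingArguments(output_dir="out", fp16=True)
assert os.environ["ACCELERATE_MIXED_PRECISION"] == "fp16"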
