Commit 27d9125
Cast input before moving to device for all strategies (#18264)
1 parent efa7b2f commit 27d9125
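In short, this commit moves the precision cast of the input batch out of the per-strategy `*_step` wrappers (and out of DeepSpeed's `batch_to_device` override) and into the loops, so the batch is cast to the configured precision before any transfer-to-device hook runs, for all strategies. A minimal sketch of the resulting per-batch order in the loops, using the names that appear in the hunks below:

batch = trainer.precision_plugin.convert_input(batch)  # cast to the configured precision first
batch = trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
batch = call._call_strategy_hook(trainer, "batch_to_device", batch, dataloader_idx=dataloader_idx)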

8 files changed (+26, -27 lines)

src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -138,6 +138,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Increased the minimum supported `wandb` version for `WandbLogger` from 0.12.0 to 0.12.10 ([#18171](https://github.com/Lightning-AI/lightning/pull/18171))


+- The input tensors now get cast to the right precision type before transfer to the device ([#18264](https://github.com/Lightning-AI/lightning/pull/18264))
+
+
 ### Deprecated

 - Deprecated the `SingleTPUStrategy` (`strategy="single_tpu"`) in favor of `SingleDeviceXLAStrategy` (`strategy="single_xla"`) ([#17383](https://github.com/Lightning-AI/lightning/pull/17383))

src/lightning/pytorch/loops/evaluation_loop.py

Lines changed: 1 addition & 0 deletions
@@ -382,6 +382,7 @@ def _evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
         """
         trainer = self.trainer

+        batch = trainer.precision_plugin.convert_input(batch)
         batch = trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
         batch = call._call_strategy_hook(trainer, "batch_to_device", batch, dataloader_idx=dataloader_idx)

src/lightning/pytorch/loops/prediction_loop.py

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:

         """
         trainer = self.trainer
+        batch = trainer.precision_plugin.convert_input(batch)
         batch = trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
         batch = call._call_strategy_hook(trainer, "batch_to_device", batch, dataloader_idx=dataloader_idx)

src/lightning/pytorch/loops/training_epoch_loop.py

Lines changed: 1 addition & 0 deletions
@@ -194,6 +194,7 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
         self.batch_progress.is_last_batch = data_fetcher.done

         trainer = self.trainer
+        batch = trainer.precision_plugin.convert_input(batch)
         batch = trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=0)
         batch = call._call_strategy_hook(trainer, "batch_to_device", batch, dataloader_idx=0)

src/lightning/pytorch/strategies/deepspeed.py

Lines changed: 0 additions & 7 deletions
@@ -895,10 +895,3 @@ def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
             offload_params_device="nvme",
             offload_optimizer_device="nvme",
         )
-
-    def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any:
-        # The strategy casts the input before moving to the device
-        # In all other strategies, the input gets converted in the `Strategy.*_step` methods
-        # TODO: standardize this for all strategies
-        batch = self.precision_plugin.convert_input(batch)
-        return super().batch_to_device(batch, device, dataloader_idx)
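The removed comment above points at the inconsistency this commit resolves: DeepSpeed used to be the only strategy that cast the batch inside `batch_to_device`, while the others cast it inside their `*_step` wrappers (removed in the `strategy.py` hunks below); the cast now happens once, in the loops. Conceptually, `convert_input` casts floating-point tensors in the batch to the plugin's target dtype; a rough illustrative sketch of that idea for a single tensor, not the library's actual implementation:

import torch


def convert_input_sketch(batch: torch.Tensor, dst_dtype: torch.dtype) -> torch.Tensor:
    # Illustrative only: cast floating-point tensors to the target dtype and
    # leave non-float data (e.g. integer token ids) untouched.
    if torch.is_floating_point(batch):
        return batch.to(dst_dtype)
    return batch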

src/lightning/pytorch/strategies/strategy.py

Lines changed: 0 additions & 4 deletions
@@ -372,7 +372,6 @@ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         See :meth:`~lightning.pytorch.core.module.LightningModule.training_step` for more details

         """
-        args, kwargs = self.precision_plugin.convert_input((args, kwargs))
         assert self.lightning_module is not None
         assert self.model is not None
         with self.precision_plugin.train_step_context():
@@ -394,7 +393,6 @@ def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         See :meth:`~lightning.pytorch.core.module.LightningModule.validation_step` for more details

         """
-        args, kwargs = self.precision_plugin.convert_input((args, kwargs))
         assert self.lightning_module is not None
         assert self.model is not None
         with self.precision_plugin.val_step_context():
@@ -408,7 +406,6 @@ def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         See :meth:`~lightning.pytorch.core.module.LightningModule.test_step` for more details

         """
-        args, kwargs = self.precision_plugin.convert_input((args, kwargs))
         assert self.lightning_module is not None
         assert self.model is not None
         with self.precision_plugin.test_step_context():
@@ -422,7 +419,6 @@ def predict_step(self, *args: Any, **kwargs: Any) -> Any:
         See :meth:`~lightning.pytorch.core.module.LightningModule.predict_step` for more details

         """
-        args, kwargs = self.precision_plugin.convert_input((args, kwargs))
         assert self.lightning_module is not None
         assert self.model is not None
         with self.precision_plugin.predict_step_context():

tests/tests_pytorch/models/test_gpu.py

Lines changed: 20 additions & 0 deletions
@@ -210,3 +210,23 @@ def to(self, *args, **kwargs):
     with patch.object(batch, "to", wraps=batch.to) as mocked:
         batch = trainer.strategy.batch_to_device(batch, torch.device("cuda:0"))
         mocked.assert_called_with(torch.device("cuda", 0))
+
+
+@RunIf(min_cuda_gpus=1)
+@pytest.mark.parametrize(
+    ("strategy", "precision", "expected_dtype"),
+    [
+        ("auto", "16-mixed", torch.float32),
+        ("auto", "16-true", torch.float16),
+        pytest.param("deepspeed", "bf16-true", torch.bfloat16, marks=RunIf(deepspeed=True, bf16_cuda=True)),
+    ],
+)
+def test_input_tensors_cast_before_transfer_to_device(strategy, precision, expected_dtype):
+    class CustomBoringModel(BoringModel):
+        def transfer_batch_to_device(self, batch, *args, **kwargs):
+            assert batch.dtype == expected_dtype
+            return super().transfer_batch_to_device(batch, *args, **kwargs)
+
+    model = CustomBoringModel()
+    trainer = Trainer(strategy=strategy, devices=1, precision=precision, barebones=True, max_steps=2)
+    trainer.fit(model)
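To observe the same behavior outside the test suite, a minimal standalone sketch (assuming a lightning release that includes this commit and a single CUDA GPU) that overrides `transfer_batch_to_device` and checks the dtype, mirroring the test above:

import torch
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel


class DtypeCheckingModel(BoringModel):
    def transfer_batch_to_device(self, batch, *args, **kwargs):
        # With precision="16-true", the loop casts the batch before this hook runs.
        assert batch.dtype == torch.float16
        return super().transfer_batch_to_device(batch, *args, **kwargs)


if __name__ == "__main__":
    trainer = Trainer(accelerator="cuda", devices=1, precision="16-true", barebones=True, max_steps=2)
    trainer.fit(DtypeCheckingModel())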

tests/tests_pytorch/strategies/test_deepspeed_strategy.py

Lines changed: 0 additions & 16 deletions
@@ -1256,22 +1256,6 @@ def configure_optimizers(self):
     trainer.fit(model)


-@RunIf(min_cuda_gpus=1, deepspeed=True)
-def test_deepspeed_tensors_cast_to_fp16_before_hosted_on_device():
-    class CustomBoringModel(BoringModel):
-        def transfer_batch_to_device(self, batch, *args, **kwargs):
-            assert batch.dtype is torch.float16
-            return super().transfer_batch_to_device(batch, *args, **kwargs)
-
-    model = CustomBoringModel()
-    trainer = Trainer(strategy="deepspeed", devices=1, accelerator="cuda", precision="16-mixed")
-    trainer.strategy.connect(model)
-    batch = torch.zeros(1, dtype=torch.float32)
-    batch = trainer.strategy.batch_to_device(batch)
-    assert batch.is_cuda
-    assert batch.dtype is torch.float16
-
-
 @RunIf(deepspeed=True)
 @pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]])
 def test_validate_parallel_devices_indices(device_indices):