Commit c0c273c

pass deterministic.fill_uninitialized_memory to HF model
1 parent 9be95da

File tree

3 files changed: +6 additions, -2 deletions

torchtitan/distributed/utils.py

Lines changed: 0 additions & 2 deletions

@@ -111,8 +111,6 @@ def set_determinism(
         )
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
-        # Otherwise, Hugging Face modeling registers a buffer for RoPE (inv_freq) and this will by default be initialized to NaN
-        torch.utils.deterministic.fill_uninitialized_memory = False
         # env var for deterministic CuBLAS
         # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
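For context: when torch.use_deterministic_algorithms(True) is active, torch.utils.deterministic.fill_uninitialized_memory (default True) makes PyTorch fill "uninitialized" tensor memory (e.g. from torch.empty) with NaN for floating-point dtypes. The line removed here had disabled that globally for every run; the commit instead moves the override into the HF backend, gated on the model's deterministic flag. A minimal sketch of the flag's effect (printed values are illustrative):

import torch

# Deterministic mode turns on the fill-uninitialized-memory behavior.
torch.use_deterministic_algorithms(True)

# Default: torch.empty is filled with NaN so stale memory is easy to spot.
torch.utils.deterministic.fill_uninitialized_memory = True
print(torch.empty(3))  # tensor([nan, nan, nan])

# With the flag off, the memory is left truly uninitialized (arbitrary values).
torch.utils.deterministic.fill_uninitialized_memory = False
print(torch.empty(3))  # arbitrary values, not guaranteed to be NaN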

torchtitan/experiments/transformers_backend/model/args.py

Lines changed: 2 additions & 0 deletions

@@ -171,6 +171,8 @@ def update_from_config(self, job_config: JobConfig):

         self.max_seq_len = job_config.training.seq_len

+        self.deterministic = job_config.debug.deterministic
+
         # Configure HF-specific settings to match TorchTitan settings
         # TODO: false ?
         self.attention_bias = False

torchtitan/experiments/transformers_backend/model/model.py

Lines changed: 4 additions & 0 deletions

@@ -50,6 +50,10 @@ class HFTransformerModel(nn.Module):
     def __init__(self, model_args: HFTransformerModelArgs):
         super().__init__()

+        # NOTE(3outeille): This prevents Hugging Face modeling from initializing RoPE (inv_freq) buffers to NaN. Useful when loading from a seed checkpoint.
+        if hasattr(model_args, 'deterministic') and model_args.deterministic:
+            torch.utils.deterministic.fill_uninitialized_memory = False
+
         # Try to import the model class dynamically from the transformers library if not found in globals
         model_class_name = model_args.architectures[0]
         model_cls = globals().get(model_class_name, None)
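The new guard only touches the global flag when the run was configured with debug.deterministic; the hasattr check keeps older args objects without the attribute working. A minimal sketch of that gating in isolation, using SimpleNamespace as a hypothetical stand-in for HFTransformerModelArgs after update_from_config (not the real class):

import torch
from types import SimpleNamespace

def maybe_disable_nan_fill(model_args) -> None:
    # Mirrors the check added in HFTransformerModel.__init__: only override the
    # global flag when the run was configured as deterministic.
    if hasattr(model_args, 'deterministic') and model_args.deterministic:
        torch.utils.deterministic.fill_uninitialized_memory = False

# Hypothetical stand-in for args produced from debug.deterministic = true.
maybe_disable_nan_fill(SimpleNamespace(deterministic=True))
print(torch.utils.deterministic.fill_uninitialized_memory)  # False

# Args without the attribute (e.g. older configs) leave the flag untouched.
maybe_disable_nan_fill(SimpleNamespace())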
