
Commit 801399b

Address review comments
- Add BSD-style license headers to all new files: * batch_invariant_backward.py * simple_rl.py * tests/test_batch_invariant_backward.py * tests/test_exact_determinism.py * weights_vllm_compat.py * weights/converter.py * weights/__init__.py - Add note about single-device limitation in README.md Currently supports single-device training only; future work will extend to distributed training with parallelism - Remove unused imports in simple_rl.py: * Remove 'import torchtitan.experiments.compat' (unused) * Remove duplicate imports of torchtitan_to_vllm_compat - Fix all imports to use absolute paths for python -m compatibility: * Update model_vllm_compat.py to import from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward * Update simple_rl.py to import from torchtitan.experiments.deterministic_vllm_rl modules * Removes sys.path manipulation - now works cleanly with python -m - Remove duplicate RMSNormFunction from model_vllm_compat.py: * Import rms_norm_with_gradients from batch_invariant_backward.py * Remove duplicate RMSNormFunction class and function definition * Keeps gradient-enabled operations centralized in utilities module
1 parent 2823b41 commit 801399b
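In practice, the import fix described above amounts to replacing the script-style imports (which needed sys.path manipulation when the files were run directly) with package-qualified absolute imports. A minimal sketch drawn from the diff below:

```python
# Before: only resolvable when run from inside the experiment directory.
# sys.path.insert(0, str(Path(__file__).parent))
# from batch_invariant_backward import rms_norm_with_gradients

# After: absolute import, works from anywhere, including `python -m` invocations.
from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward import (
    rms_norm_with_gradients,
)
```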

File tree

9 files changed: +69 -114 lines changed

torchtitan/experiments/deterministic_vllm_rl/README.md

Lines changed: 3 additions & 2 deletions
@@ -21,6 +21,8 @@ This experiment solves both problems by:
 - **Gradient Support**: Full backward pass support for training
 - **Model Compatibility**: Drop-in replacement for standard Qwen3 models in TorchTitan
 
+**Note**: This experiment currently supports single-device training only. We plan to extend support for distributed training with tensor parallelism and pipeline parallelism in the future.
+
 ## Architecture
 
 ### Components
@@ -110,8 +112,7 @@
 Run the complete RL training loop:
 
 ```bash
-cd torchtitan/experiments/deterministic_vllm_rl
-python simple_rl.py
+VLLM_BATCH_INVARIANT=1 VLLM_FLASH_ATTN_VERSION=3 with-proxy python -m torchtitan.experiments.deterministic_vllm_rl.simple_rl
 ```
 
 This will:

torchtitan/experiments/deterministic_vllm_rl/batch_invariant_backward.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Batch-invariant operations with backward pass support.

torchtitan/experiments/deterministic_vllm_rl/models/qwen3/model_vllm_compat.py

Lines changed: 17 additions & 94 deletions
@@ -8,28 +8,23 @@
 # Uses merged gate_up projections and vLLM Flash Attention
 
 import torch
-import torch.nn.functional as F
 from torch import nn
-from torch.nn.attention.flex_attention import and_masks, BlockMask
 
 from torchtitan.components.tokenizer import BaseTokenizer
-from torchtitan.models.attention import (
-    create_attention_mask,
-    get_causal_mask_mod,
-    get_document_mask_mod,
-)
-from torchtitan.protocols.model import AttentionMasksType
-from torchtitan.protocols.train_spec import ModelProtocol
-
-# Import from local experiment's models
-from ..attention import VLLMCompatibleFlashAttention
 
 # Import from main torchtitan
 from torchtitan.models.qwen3.model.args import Qwen3ModelArgs
+from torchtitan.protocols.model import AttentionMasksType
+from torchtitan.protocols.train_spec import ModelProtocol
 
 # Import vLLM's exact operations for bitwise determinism
 from vllm.model_executor.layers.activation import SiluAndMul as VLLMSiluAndMul
-from vllm.model_executor.layers.batch_invariant import rms_norm as vllm_rms_norm
+
+# Import gradient-enabled operations from experiment utilities
+from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward import rms_norm_with_gradients
+
+# Import from local experiment's models
+from ..attention import VLLMCompatibleFlashAttention
 
 
 # RoPE functions (same as original)
@@ -90,84 +85,6 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
 SiluAndMul = VLLMSiluAndMul
 
 
-class RMSNormFunction(torch.autograd.Function):
-    """
-    Autograd function for RMS normalization using vLLM's Triton kernel in forward
-    and batch-invariant operations in backward.
-    """
-
-    @staticmethod
-    def forward(ctx, input, weight, eps):
-        """
-        Forward pass using vLLM's rms_norm Triton kernel.
-
-        Args:
-            input: Input tensor [*, hidden_size]
-            weight: Weight tensor [hidden_size]
-            eps: Epsilon for numerical stability
-
-        Returns:
-            output: Normalized and scaled tensor [*, hidden_size]
-        """
-        # Use vLLM's Triton kernel for forward (deterministic)
-        output = vllm_rms_norm(input, weight, eps)
-
-        # Save for backward
-        ctx.save_for_backward(input, weight)
-        ctx.eps = eps
-
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """
-        Backward pass using batch-invariant PyTorch operations.
-
-        Returns:
-            (grad_input, grad_weight, None)
-        """
-        input, weight = ctx.saved_tensors
-        eps = ctx.eps
-
-        # Compute forward pass values needed for backward
-        # variance = mean(x^2) along last dim
-        variance = (input * input).mean(dim=-1, keepdim=True)
-        rms = torch.sqrt(variance + eps)
-        x_norm = input / rms
-
-        # Gradient w.r.t. weight
-        # grad_weight = sum(grad_output * x_norm) over all dims except last
-        grad_weight = (grad_output * x_norm).sum(dim=tuple(range(grad_output.ndim - 1)))
-
-        # Gradient w.r.t. input
-        # grad_x_norm = grad_output * weight
-        grad_x_norm = grad_output * weight
-
-        # grad_x = (grad_x_norm - mean(grad_x_norm * x_norm) * x_norm) / rms
-        mean_term = (grad_x_norm * x_norm).mean(dim=-1, keepdim=True)
-        grad_input = (grad_x_norm - mean_term * x_norm) / rms
-
-        return grad_input, grad_weight, None
-
-
-def rms_norm_with_gradients(input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
-    """
-    RMS normalization with gradient support.
-
-    Uses vLLM's Triton kernel for forward pass (deterministic) and
-    batch-invariant PyTorch operations for backward pass.
-
-    Args:
-        input: Input tensor [*, hidden_size]
-        weight: Weight tensor [hidden_size]
-        eps: Epsilon for numerical stability
-
-    Returns:
-        output: Normalized and scaled tensor [*, hidden_size]
-    """
-    return RMSNormFunction.apply(input, weight, eps)
-
-
 class VLLMRMSNorm(nn.Module):
     """
     RMSNorm using vLLM's exact Triton kernel for bitwise determinism.
@@ -253,10 +170,14 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.k_norm = None
 
         # QKV projections
-        self.wq = nn.Linear(model_args.dim, model_args.n_heads * self.head_dim, bias=False)
+        self.wq = nn.Linear(
+            model_args.dim, model_args.n_heads * self.head_dim, bias=False
+        )
         self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
         self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(model_args.n_heads * self.head_dim, model_args.dim, bias=False)
+        self.wo = nn.Linear(
+            model_args.n_heads * self.head_dim, model_args.dim, bias=False
+        )
 
         # Always use vLLM compatible flash attention
         self.inner_attention = VLLMCompatibleFlashAttention()
@@ -303,7 +224,9 @@ def forward(
         xv = values.transpose(1, 2)
 
         # Apply flash attention (vLLM compatible, no flex attention)
-        assert attention_masks is None, "vLLM compat mode doesn't use flex attention masks"
+        assert (
+            attention_masks is None
+        ), "vLLM compat mode doesn't use flex attention masks"
         output = self.inner_attention(xq, xk, xv, scale=self.scaling)
 
         # Transpose back
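The backward() removed above hand-derives the RMSNorm gradients that are now centralized in batch_invariant_backward.py. A minimal, self-contained sketch to sanity-check that algebra against autograd, using a plain PyTorch reference (rms_norm_ref is a hypothetical stand-in for vLLM's Triton kernel, not part of the experiment):

```python
import torch

def rms_norm_ref(x, w, eps=1e-6):
    # Reference RMSNorm: y = (x / rms(x)) * w, rms = sqrt(mean(x^2) + eps)
    rms = torch.sqrt((x * x).mean(dim=-1, keepdim=True) + eps)
    return (x / rms) * w

torch.manual_seed(0)
eps = 1e-6
x = torch.randn(4, 16, dtype=torch.float64, requires_grad=True)
w = torch.randn(16, dtype=torch.float64, requires_grad=True)

# Gradients from autograd on the reference implementation
out = rms_norm_ref(x, w, eps)
grad_out = torch.randn_like(out)
gx_auto, gw_auto = torch.autograd.grad(out, (x, w), grad_out)

# Same algebra as the removed backward():
# grad_x = (grad_out*w - mean(grad_out*w*x_norm) * x_norm) / rms
with torch.no_grad():
    rms = torch.sqrt((x * x).mean(dim=-1, keepdim=True) + eps)
    x_norm = x / rms
    gw_manual = (grad_out * x_norm).sum(dim=tuple(range(grad_out.ndim - 1)))
    g_xnorm = grad_out * w
    mean_term = (g_xnorm * x_norm).mean(dim=-1, keepdim=True)
    gx_manual = (g_xnorm - mean_term * x_norm) / rms

print(torch.allclose(gx_auto, gx_manual), torch.allclose(gw_auto, gw_manual))
```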

torchtitan/experiments/deterministic_vllm_rl/simple_rl.py

Lines changed: 10 additions & 10 deletions
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Simple RL training loop with GRPO-style advantage estimation.
 
@@ -11,19 +17,16 @@
 """
 
 import os
-import tempfile
 import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoConfig
 from safetensors.torch import load_file, save_file
 from huggingface_hub import snapshot_download
-import numpy as np
 from torch.utils.tensorboard import SummaryWriter
 
-import torchtitan.experiments.compat
 from torchtitan.models.qwen3.model.args import Qwen3ModelArgs
-from weights_vllm_compat import torchtitan_to_vllm_compat, vllm_compat_to_torchtitan
-from weights.converter import torchtitan_to_vllm, vllm_to_torchtitan
+from torchtitan.experiments.deterministic_vllm_rl.weights_vllm_compat import torchtitan_to_vllm_compat, vllm_compat_to_torchtitan
+from torchtitan.experiments.deterministic_vllm_rl.weights.converter import torchtitan_to_vllm, vllm_to_torchtitan
 
 from vllm import LLM, SamplingParams
 from vllm.model_executor.layers.batch_invariant import init_batch_invariance
@@ -618,16 +621,13 @@ def rl_update_step(
         metrics: Dict of training metrics
     """
     # Update vLLM weights from current policy
-    from weights_vllm_compat import torchtitan_to_vllm_compat
     titan_state = model.state_dict()
     vllm_compat_state = torchtitan_to_vllm_compat(titan_state)
     vllm_engine.update_weights(vllm_compat_state)
 
     # Round-trip: load weights back from disk to maintain consistency with vLLM
     import glob
     from safetensors.torch import load_file as sf_load
-    from weights.converter import vllm_to_torchtitan
-    from weights_vllm_compat import torchtitan_to_vllm_compat as titan_to_vllm_compat
 
     shard_files = sorted(glob.glob(os.path.join(vllm_engine.temp_model_dir, "model-*.safetensors")))
     if shard_files:
@@ -642,7 +642,7 @@
 
         if use_vllm_compat:
             # Convert to vLLM-compat format for vLLM-compatible model
-            weights_for_model = titan_to_vllm_compat(titan_from_disk)
+            weights_for_model = torchtitan_to_vllm_compat(titan_from_disk)
         else:
             # Use standard TorchTitan format for standard model
             weights_for_model = titan_from_disk
@@ -776,7 +776,7 @@ def main():
         print("Batch invariance detected - using vLLM-compatible model")
         # Add backward pass support to vLLM's batch_invariant mode
         print("Adding gradient support to vLLM's batch_invariant mode...")
-        from batch_invariant_backward import patch_batch_invariant_with_gradients
+        from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward import patch_batch_invariant_with_gradients
         patch_batch_invariant_with_gradients()
     else:
         print("Batch invariance NOT detected - using standard model")

torchtitan/experiments/deterministic_vllm_rl/tests/test_batch_invariant_backward.py

Lines changed: 7 additions & 6 deletions
@@ -1,15 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Test batch_invariant_backward module to ensure it works correctly.
 """
 
 import torch
-import sys
-from pathlib import Path
-
-# Add current directory to path
-sys.path.insert(0, str(Path(__file__).parent))
 
-from batch_invariant_backward import (
+from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward import (
     enable_batch_invariant_backward_mode,
     disable_batch_invariant_backward_mode,
     mm_batch_invariant_backward,

torchtitan/experiments/deterministic_vllm_rl/tests/test_exact_determinism.py

Lines changed: 8 additions & 2 deletions
@@ -1,12 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Test if batch_invariant operations are EXACTLY deterministic.
 
 This runs the same operation multiple times and checks if results are bit-for-bit identical.
 """
 
 import torch
-from batch_invariant_backward import enable_batch_invariant_backward_mode
-from vllm.model_executor.layers.batch_invariant import disable_batch_invariant_mode, matmul_persistent
+from torchtitan.experiments.deterministic_vllm_rl.batch_invariant_backward import enable_batch_invariant_backward_mode
+from vllm.model_executor.layers.batch_invariant import disable_batch_invariant_mode
 
 print("Enabling batch_invariant_backward mode...")
 disable_batch_invariant_mode()
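For context, a bit-for-bit determinism check of the kind this test performs can be illustrated with a minimal, self-contained sketch in plain PyTorch (it does not use the experiment's or vLLM's API):

```python
import torch

# Run the same matmul twice on identical inputs and require exact equality.
# torch.equal compares element-wise values exactly, so a kernel whose
# reduction order varies between runs would make this check fail.
torch.manual_seed(0)
a = torch.randn(128, 256)
b = torch.randn(256, 64)

out1 = a @ b
out2 = a @ b
print("bitwise identical:", torch.equal(out1, out2))
```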

torchtitan/experiments/deterministic_vllm_rl/weights/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """Weight conversion utilities for vLLM and TorchTitan."""
 
 from .converter import vllm_to_torchtitan, torchtitan_to_vllm

torchtitan/experiments/deterministic_vllm_rl/weights/converter.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Minimal weight converter between vLLM and TorchTitan formats for Qwen3-1.7B.

torchtitan/experiments/deterministic_vllm_rl/weights_vllm_compat.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 """
 Weight conversion utilities for Qwen3VLLMCompatModel.

0 commit comments