Commit 55352a5

collapse batch outside of dataloader
1 parent cad97e5

File tree

7 files changed: +77 -159 lines changed


torchtitan/hf_datasets/text_datasets.py

Lines changed: 2 additions & 126 deletions
@@ -18,7 +18,6 @@
 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.config import JobConfig
 from torchtitan.hf_datasets import DatasetConfig
-from torchtitan.protocols import train_spec
 from torchtitan.tools.logging import logger


@@ -68,63 +67,6 @@ def _validate_dataset(
     return path, config.loader, config.sample_processor


-def varlen_collate_fn(batch):
-    """
-    Custom collate function for variable length attention
-    Collapses batch dimension by packing all samples into one sequence
-
-    Args:
-        batch: List of (input_dict, label) tuples
-
-    Returns:
-        packed (input_dict, label) with collapsed batch dimension
-    """
-    if len(batch) == 1:
-        input_dict, label = batch[0]
-        return {
-            "input": input_dict["input"].unsqueeze(0),  # [1, seq_len]
-            "cu_seq_q": input_dict["cu_seq_q"],
-            "cu_seq_k": input_dict["cu_seq_k"],
-            "max_q": input_dict["max_q"],
-            "max_k": input_dict["max_k"],
-        }, label.unsqueeze(
-            0
-        )  # [1, seq_len]
-
-    inputs = []
-    labels = []
-    cu_seqlens_list = []
-    offset = 0
-    max_seqlen = 0
-
-    for input_dict, label in batch:
-        inputs.append(input_dict["input"])
-        labels.append(label)
-
-        cu_seqlens = input_dict["cu_seq_q"]
-        cu_seqlens_adjusted = cu_seqlens[:-1] + offset
-        cu_seqlens_list.append(cu_seqlens_adjusted)
-
-        max_seqlen = max(max_seqlen, input_dict["max_q"])
-
-        offset += len(input_dict["input"])
-
-    packed_input = torch.cat(inputs, dim=0).unsqueeze(0)  # shape: [1, total_tokens]
-    packed_label = torch.cat(labels, dim=0).unsqueeze(0)  # shape: [1, total_tokens]
-
-    packed_cu_seqlens = torch.cat(
-        cu_seqlens_list + [torch.tensor([offset], dtype=torch.int32)]
-    )
-
-    return {
-        "input": packed_input,
-        "cu_seq_q": packed_cu_seqlens,
-        "cu_seq_k": packed_cu_seqlens,
-        "max_q": max_seqlen,
-        "max_k": max_seqlen,
-    }, packed_label
-
-
 class HuggingFaceTextDataset(IterableDataset, Stateful):
     def __init__(
         self,
@@ -155,9 +97,6 @@ def __init__(
         self._sample_idx = 0
         self._token_buffer: list[int] = []

-        self._boundary_buffer: list[int] = [0]
-        self.use_varlen_attn: bool = False
-
     def _get_data_iter(self):
         # For map-style datasets, resume by skipping to the correct index
         # For iterable-style datasets, the underlying iterator already points to the correct index
@@ -182,63 +121,13 @@ def __iter__(self):
                 self._token_buffer.extend(sample_tokens)
                 self._sample_idx += 1

-                if self.use_varlen_attn:
-                    self._boundary_buffer.append(len(self._token_buffer))
-
                 while len(self._token_buffer) >= max_buffer_token_len:
                     x = torch.LongTensor(self._token_buffer[:max_buffer_token_len])
-
                     # update tokens to the remaining tokens
                     self._token_buffer = self._token_buffer[max_buffer_token_len:]
-
                     input = x[:-1]
                     label = x[1:]
-
-                    if self.use_varlen_attn:
-                        boundaries_in_window = [
-                            b
-                            for b in self._boundary_buffer
-                            if b <= max_buffer_token_len
-                        ]
-
-                        cu_seqlens = torch.tensor(
-                            boundaries_in_window, dtype=torch.int32
-                        )
-
-                        self._boundary_buffer = [
-                            b - max_buffer_token_len
-                            for b in self._boundary_buffer
-                            if b > max_buffer_token_len
-                        ]
-
-                        if not self._boundary_buffer or self._boundary_buffer[0] != 0:
-                            self._boundary_buffer.insert(0, 0)
-
-                        cu_seqlens_input = cu_seqlens[cu_seqlens <= len(input)]
-                        if cu_seqlens_input[-1] != len(input):
-                            cu_seqlens_input = torch.cat(
-                                [
-                                    cu_seqlens_input,
-                                    torch.tensor([len(input)], dtype=torch.int32),
-                                ]
-                            )
-
-                        seq_lengths = torch.diff(cu_seqlens_input)
-                        max_seqlen = (
-                            seq_lengths.max().item()
-                            if len(seq_lengths) > 0
-                            else self.seq_len
-                        )
-
-                        yield {
-                            "input": input,
-                            "cu_seq_q": cu_seqlens_input,
-                            "cu_seq_k": cu_seqlens_input,
-                            "max_q": max_seqlen,
-                            "max_k": max_seqlen,
-                        }, label
-                    else:
-                        yield {"input": input}, label
+                    yield {"input": input}, label

             if not self.infinite:
                 logger.warning(f"Dataset {self.dataset_name} has run out of data")
@@ -256,7 +145,6 @@ def __iter__(self):

     def load_state_dict(self, state_dict):
         self._token_buffer = state_dict["token_buffer"]
-        self._boundary_buffer = state_dict.get("boundary_buffer", [0])

         if isinstance(self._data, Dataset):
             self._sample_idx = state_dict["sample_idx"]
@@ -265,10 +153,7 @@ def load_state_dict(self, state_dict):
             self._data.load_state_dict(state_dict["data"])

     def state_dict(self):
-        _state_dict = {
-            "token_buffer": self._token_buffer,
-            "boundary_buffer": self._boundary_buffer,
-        }
+        _state_dict = {"token_buffer": self._token_buffer}

         if isinstance(self._data, Dataset):
             _state_dict["sample_idx"] = self._sample_idx
@@ -293,11 +178,6 @@ def build_text_dataloader(
     batch_size = job_config.training.local_batch_size
     seq_len = job_config.training.seq_len

-    model_args = train_spec.get_train_spec(job_config.model.name).model_args[
-        job_config.model.flavor
-    ]
-    use_varlen_attn = getattr(model_args, "use_varlen_attn", False)
-
     hf_ds = HuggingFaceTextDataset(
         dataset_name=dataset_name,
         dataset_path=dataset_path,
@@ -307,16 +187,12 @@
         dp_world_size=dp_world_size,
         infinite=infinite,
     )
-    hf_ds.use_varlen_attn = use_varlen_attn
-
-    collate_fn = varlen_collate_fn if use_varlen_attn else None

     return ParallelAwareDataloader(
         dataset=hf_ds,
         dp_rank=dp_rank,
         dp_world_size=dp_world_size,
         batch_size=batch_size,
-        collate_fn=collate_fn,
     )
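
Net effect of this file: the dataset and dataloader no longer know anything about variable length attention. varlen_collate_fn, the _boundary_buffer bookkeeping, and the use_varlen_attn flag are removed, so every sample is a plain {"input": tokens}, label pair and the default collation stacks them into [batch, seq_len] tensors. A minimal sketch of what a consumer now sees (the loop and variable names are illustrative, not part of this commit):

# Illustrative only: iterate the dataloader returned by build_text_dataloader.
for input_dict, labels in dataloader:
    tokens = input_dict["input"]          # LongTensor of shape [local_batch_size, seq_len]
    assert labels.shape == tokens.shape   # next-token labels, same shape
    # No cu_seq_q / cu_seq_k / max_q / max_k keys here anymore; varlen metadata
    # is now derived from EOS positions inside the model (see attention.py below).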

torchtitan/models/attention.py

Lines changed: 55 additions & 1 deletion
@@ -20,7 +20,7 @@
     flex_attention,
 )

-from torch.nn.attention.varlen import varlen_attn
+from torch.nn.attention.varlen import varlen_attn, VarlenMetadata


 __all__ = [
@@ -251,3 +251,57 @@ def create_attention_mask(*args, **kwargs):
     arguments.
     """
     return _compiled_create_block_mask(*args, **kwargs)
+
+
+def create_varlen_cu_seqs(input_batch: torch.Tensor, eos_id: int) -> VarlenMetadata:
+    """
+    Creates cumulative sequence length indices needed for variable length attention
+
+    Args:
+        input_batch
+        eos_id: the EOS id marker
+
+    Returns:
+        VarlenMetadata containing cumulative sequence length indices for q, k, and max_seq_len
+    """
+    batch_size, seq_len = input_batch.shape
+    device = input_batch.device
+    cu_seqlens_list, all_seq_lengths = [], []
+    offset = 0
+    max_seqlen = 0
+
+    for b in range(batch_size):
+        tokens = input_batch[b]
+        eos_positions = (tokens == eos_id).nonzero(as_tuple=True)[0].to(torch.int32)
+        sample_cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], dtype=torch.int32, device=device),
+                eos_positions + 1,
+                torch.tensor([seq_len], dtype=torch.int32, device=device),
+            ]
+        )
+        sample_cu_seqlens = torch.unique_consecutive(sample_cu_seqlens)
+
+        seq_lengths = torch.diff(sample_cu_seqlens)
+        all_seq_lengths.append(seq_lengths)
+
+        cu_seqlens_adjusted = sample_cu_seqlens[:-1] + offset
+        cu_seqlens_list.append(cu_seqlens_adjusted)
+
+        offset += seq_len
+
+    packed_cu_seqlens = torch.cat(
+        cu_seqlens_list + [torch.tensor([offset], dtype=torch.int32, device=device)]
+    )
+
+    max_seqlen = 0
+    if len(all_seq_lengths) > 0:
+        all_seq_lengths = torch.cat(all_seq_lengths)
+        max_seqlen = all_seq_lengths.max().item()
+
+    return VarlenMetadata(
+        cu_seq_q=packed_cu_seqlens,
+        cu_seq_k=packed_cu_seqlens,
+        max_q=max_seqlen,
+        max_k=max_seqlen,
+    )
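
A small worked example may make the new helper concrete. The token values and eos_id=0 below are made up; the expected outputs follow directly from the code added above:

import torch

from torchtitan.models.attention import create_varlen_cu_seqs

# Two rows of EOS-delimited documents, packed to seq_len=8 (illustrative data).
batch = torch.tensor(
    [
        [5, 7, 0, 9, 4, 0, 2, 3],  # docs of length 3 and 3, plus a 2-token tail
        [1, 0, 6, 6, 6, 6, 6, 0],  # docs of length 2 and 6
    ]
)
meta = create_varlen_cu_seqs(batch, eos_id=0)
# Both rows are flattened into one packed 16-token stream:
#   meta.cu_seq_q -> tensor([0, 3, 6, 8, 10, 16], dtype=torch.int32)
#   meta.cu_seq_k is the same tensor (self-attention); meta.max_q == meta.max_k == 6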

torchtitan/models/llama3/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -46,6 +46,7 @@
         vocab_size=2048,
         rope_theta=500000,
         use_varlen_attn=True,
+        attn_mask_type="varlen_attn",
     ),
     "8B": TransformerModelArgs(
         dim=4096,
@@ -76,6 +77,7 @@
         multiple_of=1024,
         rope_theta=500000,
         use_varlen_attn=True,
+        attn_mask_type="varlen_attn",
     ),
     "70B": TransformerModelArgs(
         dim=8192,

torchtitan/models/llama3/model/model.py

Lines changed: 12 additions & 23 deletions
@@ -13,11 +13,12 @@
 from torch import nn
 from torch.nn.attention.flex_attention import and_masks, BlockMask

-from torch.nn.attention.varlen import varlen_attn
+from torch.nn.attention.varlen import varlen_attn, VarlenMetadata

 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.models.attention import (
     create_attention_mask,
+    create_varlen_cu_seqs,
     FlexAttentionWrapper,
     get_causal_mask_mod,
     get_document_mask_mod,
@@ -227,6 +228,7 @@ def forward(
         """

         bs, seqlen, _ = x.shape
+
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

         # Use -1 instead of `n_heads` (or `n_kv_heads`) to infer the actual
@@ -236,20 +238,7 @@
         xk = xk.view(bs, seqlen, -1, self.head_dim)
         xv = xv.view(bs, seqlen, -1, self.head_dim)

-        if self.use_varlen_attn:
-            true_seq_len = freqs_cis.shape[0]
-            total_tokens = xq.shape[1]
-
-            true_bs = total_tokens // true_seq_len
-            xq = xq.view(true_bs, true_seq_len, -1, self.head_dim)
-            xk = xk.view(true_bs, true_seq_len, -1, self.head_dim)
-
-            xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
-
-            xq = xq.view(1, total_tokens, -1, self.head_dim)
-            xk = xk.view(1, total_tokens, -1, self.head_dim)
-        else:
-            xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

         # repeat k/v heads if n_kv_heads < n_heads
         keys = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
@@ -259,18 +248,16 @@
         xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)

-        assert (
-            isinstance(attention_masks, BlockMask) or attention_masks is None
-        ), attention_masks
-
         if self.use_flex_attn:
             assert isinstance(attention_masks, BlockMask), attention_masks
             output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
         elif self.use_varlen_attn:
-            cu_seq_q = kwargs.get("cu_seq_q")
-            cu_seq_k = kwargs.get("cu_seq_k")
-            max_q = kwargs.get("max_q")
-            max_k = kwargs.get("max_k")
+            assert isinstance(attention_masks, VarlenMetadata), attention_masks
+
+            cu_seq_q = attention_masks.cu_seq_q
+            cu_seq_k = attention_masks.cu_seq_k
+            max_q = attention_masks.max_q
+            max_k = attention_masks.max_k

             n_local_heads = xq.shape[1]
             xq_packed = (
@@ -515,6 +502,8 @@ def get_attention_masks(
             case "block_causal":
                 B = input_batch.shape[0]
                 mask_mods.append(get_document_mask_mod(input_batch, tokenizer.eos_id))
+            case "varlen_attn":
+                return create_varlen_cu_seqs(input_batch, tokenizer.eos_id)
             case _:
                 raise ValueError(
                     f"Unknown attention mask type: {self.model_args.attn_mask_type}"
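
Taken together with the dataloader change, the varlen path no longer threads cu_seq_q / cu_seq_k / max_q / max_k through collate_fn and kwargs: get_attention_masks builds a VarlenMetadata directly from the packed input batch, and the attention layer reads the fields off that object. A rough sketch of the intended call flow (the trainer-side variable names and keyword usage here are assumptions, not code from this commit):

# Hypothetical training-step fragment, assuming attn_mask_type == "varlen_attn".
tokens = input_dict["input"].to(device)        # [bs, seq_len], EOS-delimited documents
attention_masks = model.get_attention_masks(
    input_batch=tokens,
    tokenizer=tokenizer,
)                                              # returns a VarlenMetadata for this batch
logits = model(tokens, attention_masks=attention_masks)
# Inside the attention forward, the varlen branch unpacks cu_seq_q/cu_seq_k/max_q/max_k
# from attention_masks and calls varlen_attn on the packed q/k/v tensors.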

torchtitan/models/llama3/train_configs/llama3_8b_varlen.toml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ save_tb_folder = "tb"

 [model]
 name = "llama3"
-flavor = "8B"
+flavor = "8B_varlen"
 hf_assets_path = "./assets/hf/Llama-3.1-8B"
 # converters = ["float8"]

torchtitan/protocols/model.py

Lines changed: 2 additions & 1 deletion
@@ -12,13 +12,14 @@
 import torch.nn as nn

 from torch.nn.attention.flex_attention import BlockMask
+from torch.nn.attention.varlen import VarlenMetadata

 from torchtitan.components.tokenizer import BaseTokenizer

 from torchtitan.config import JobConfig


-AttentionMasksType = dict[str, BlockMask] | BlockMask
+AttentionMasksType = dict[str, BlockMask] | BlockMask | VarlenMetadata


 @dataclass
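
Since AttentionMasksType is now a three-way union, code that consumes it has to branch on the concrete type. A minimal, illustrative dispatch (the helper name is made up and not part of this commit):

from torch.nn.attention.flex_attention import BlockMask
from torch.nn.attention.varlen import VarlenMetadata


def describe_masks(masks) -> str:
    # Hypothetical helper showing the three AttentionMasksType variants.
    if isinstance(masks, VarlenMetadata):
        return "varlen packing metadata (cu_seq_q / cu_seq_k / max_q / max_k)"
    if isinstance(masks, BlockMask):
        return "a single flex-attention BlockMask"
    return f"a dict of {len(masks)} named BlockMasks"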
