Commit af8e3e8: correct loss/perf
1 parent 93a5bac

6 files changed, +24 -23 lines
torchtitan/hf_datasets/text_datasets.py
Lines changed: 3 additions & 15 deletions

@@ -79,7 +79,6 @@ def varlen_collate_fn(batch):
         Packed (input_dict, label) with collapsed batch dimension
     """
     if len(batch) == 1:
-        # Single sample - already packed
         input_dict, label = batch[0]
         return {
             "input": input_dict["input"].unsqueeze(0),  # [1, seq_len]
@@ -89,7 +88,6 @@ def varlen_collate_fn(batch):
             "max_k": input_dict["max_k"],
         }, label.unsqueeze(0)  # [1, seq_len]

-    # Multiple samples - pack them together
     inputs = []
     labels = []
     cu_seqlens_list = []
@@ -100,23 +98,17 @@ def varlen_collate_fn(batch):
         inputs.append(input_dict["input"])
         labels.append(label)

-        # Get cu_seqlens from this sample and adjust by offset
         cu_seqlens = input_dict["cu_seq_q"]
-        # Don't include the last boundary (we'll add it at the end)
         cu_seqlens_adjusted = cu_seqlens[:-1] + offset
         cu_seqlens_list.append(cu_seqlens_adjusted)

-        # Track maximum sequence length across all samples
         max_seqlen = max(max_seqlen, input_dict["max_q"])

-        # Update offset for next sample
         offset += len(input_dict["input"])

-    # Concatenate all inputs and labels
-    packed_input = torch.cat(inputs, dim=0).unsqueeze(0)  # Shape: [total_tokens]
-    packed_label = torch.cat(labels, dim=0).unsqueeze(0)  # Shape: [total_tokens]
+    packed_input = torch.cat(inputs, dim=0).unsqueeze(0)  # shape: [1, total_tokens]
+    packed_label = torch.cat(labels, dim=0).unsqueeze(0)  # shape: [1, total_tokens]

-    # Combine all cu_seqlens and add final boundary
     packed_cu_seqlens = torch.cat(
         cu_seqlens_list + [torch.tensor([offset], dtype=torch.int32)]
     )
@@ -189,7 +181,6 @@ def __iter__(self):

                 # marks where this current document ends
                 if self.use_varlen_attn:
-                # if self.use_varlen_attn or self.use_flex_attn:
                     self._boundary_buffer.append(len(self._token_buffer))

                 while len(self._token_buffer) >= max_buffer_token_len:
@@ -198,19 +189,16 @@ def __iter__(self):
                     # update tokens to the remaining tokens
                     self._token_buffer = self._token_buffer[max_buffer_token_len:]

-                    input = x[:-1]  # print device here
+                    input = x[:-1]
                     label = x[1:]

                     if self.use_varlen_attn:
-                    # if self.use_varlen_attn or self.use_flex_attn:
                         boundaries_in_window = [
                             b for b in self._boundary_buffer
                             if b <= max_buffer_token_len
                         ]

                         cu_seqlens = torch.tensor(boundaries_in_window, dtype=torch.int32)
-                        # print device here
-

                         self._boundary_buffer = [
                             b - max_buffer_token_len
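
For reference, the collate path above packs several variable-length samples into a single [1, total_tokens] row and records document boundaries as cumulative sequence lengths (cu_seqlens). Below is a minimal standalone sketch of that packing idea; the sample tensors are made up and this is not the exact torchtitan function.

import torch

# Hypothetical token tensors of different lengths, one per document.
samples = [torch.arange(5), torch.arange(3), torch.arange(4)]

inputs, boundaries, offset = [], [0], 0
for tokens in samples:
    inputs.append(tokens)
    offset += len(tokens)
    boundaries.append(offset)  # cumulative end index of each document

packed = torch.cat(inputs).unsqueeze(0)                    # shape [1, 12]
cu_seqlens = torch.tensor(boundaries, dtype=torch.int32)   # tensor([0, 5, 8, 12])
max_seqlen = max(len(t) for t in samples)                  # 5

# cu_seqlens and max_seqlen are what variable-length attention kernels expect
# in order to keep attention from crossing document boundaries inside the
# single packed row.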

torchtitan/models/llama3/__init__.py
Lines changed: 3 additions & 3 deletions

@@ -55,9 +55,9 @@
         ffn_dim_multiplier=1.3,
         multiple_of=1024,
         rope_theta=500000,
-        # use_flex_attn=True,
-        # attn_mask_type="block_causal",
-        use_varlen_attn=True,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
+        # use_varlen_attn=True,
     ),
     "70B": TransformerModelArgs(
         dim=8192,
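
This switches the 8B flavor from the varlen-attention path to FlexAttention with a block-causal mask. As an illustration of what "block_causal" means for a packed row, the snippet below builds a dense visualization of the masking rule; it is not torchtitan's mask construction, and the document layout is hypothetical.

import torch

# Hypothetical packed row: token i belongs to document docs[i].
docs = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2])
q_idx = torch.arange(len(docs)).unsqueeze(1)   # query positions (rows)
kv_idx = torch.arange(len(docs)).unsqueeze(0)  # key positions (columns)

# Block-causal rule: causal within the row, and attention never crosses
# a document boundary.
allowed = (q_idx >= kv_idx) & (docs[q_idx] == docs[kv_idx])
print(allowed.int())  # block-diagonal, lower-triangular pattern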

torchtitan/models/llama3/model/model.py
Lines changed: 14 additions & 1 deletion

@@ -134,8 +134,10 @@ def apply_rotary_emb(
     Returns:
         tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
     """
+
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+
     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
@@ -282,7 +284,18 @@ def forward(
             cu_seq_q = kwargs.get("cu_seq_q_list")
             assert(cu_seq_q is not None)
             assert(type(cu_seq_q) is list)
-            xq, xk = self._apply_rotary_per_sequence(xq, xk, freqs_cis, cu_seq_q)
+
+            true_seq_len = freqs_cis.shape[0]
+            total_tokens = xq.shape[1]
+
+            true_bs = total_tokens // true_seq_len
+            xq = xq.view(true_bs, true_seq_len, -1, self.head_dim)
+            xk = xk.view(true_bs, true_seq_len, -1, self.head_dim)
+
+            xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
+
+            xq = xq.view(1, total_tokens, -1, self.head_dim)
+            xk = xk.view(1, total_tokens, -1, self.head_dim)
         else:
             xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
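
The new branch applies rotary embeddings to the packed [1, total_tokens] activations by temporarily viewing them as [batch, seq_len, heads, head_dim], so each fixed-length window reuses the same freqs_cis. A self-contained sketch of that reshape round-trip follows; the dimensions and the inline rotation are illustrative, not the project's apply_rotary_emb.

import torch

n_heads, head_dim = 2, 8
true_seq_len, true_bs = 4, 3
total_tokens = true_bs * true_seq_len

# Packed activations: one row holding true_bs windows of true_seq_len tokens.
xq = torch.randn(1, total_tokens, n_heads, head_dim)

# Complex rotations precomputed for a single window of true_seq_len positions.
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
angles = torch.outer(torch.arange(true_seq_len).float(), inv_freq)
freqs_cis = torch.polar(torch.ones_like(angles), angles)  # [true_seq_len, head_dim // 2]

# View the packed row as [bs, seq_len, ...] so positions 0..seq_len-1 line up
# with freqs_cis in every window, rotate, then flatten back to [1, total_tokens, ...].
xq_b = xq.view(true_bs, true_seq_len, n_heads, head_dim)
xq_c = torch.view_as_complex(xq_b.float().reshape(true_bs, true_seq_len, n_heads, -1, 2))
xq_rot = torch.view_as_real(xq_c * freqs_cis.view(1, true_seq_len, 1, -1)).flatten(3)
xq_out = xq_rot.view(1, total_tokens, n_heads, head_dim)

Note that the view() round-trip relies on total_tokens being an exact multiple of freqs_cis.shape[0], i.e. fixed-length windows, and that rotary positions then restart at 0 for each window rather than for each packed document.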

torchtitan/models/llama3/train_configs/debug_model.toml
Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ enable_wandb = false

 [model]
 name = "llama3"
-flavor = "debugmodel_varlen_attn"
+flavor = "debugmodel_flex_attn"
 # flavor = "debugmodel_flex_attn"
 # test folder with tokenizer.json, for debug purpose only
 hf_assets_path = "./tests/assets/tokenizer"

torchtitan/models/llama3/train_configs/llama3_8b.toml
Lines changed: 2 additions & 2 deletions

@@ -6,7 +6,7 @@ description = "Llama 3 8B training"

 [profiling]
 enable_profiling = true
-save_traces_folder = "profile_trace"
+save_traces_folder = "flex_profile_trace"
 profile_freq = 100

 [metrics]
@@ -32,7 +32,7 @@ warmup_steps = 200 # lr scheduler warm up
 local_batch_size = 1
 seq_len = 8192
 max_norm = 1.0  # grad norm clipping
-steps = 100
+steps = 1000
 dataset = "c4"

 [parallelism]

torchtitan/tools/profiling.py
Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ def trace_handler(prof):
         schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active),
         on_trace_ready=trace_handler,
         record_shapes=True,
-        # with_stack=True,  # python stack
+        with_stack=True,  # python stack
     ) as torch_profiler:
         torch_profiler.step_num = global_step
         yield torch_profiler
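
Enabling with_stack=True makes the profiler record Python call stacks so operator time can be attributed to source lines, at the cost of some extra overhead. A minimal, self-contained sketch of an equivalent profiler setup is below; the schedule values, workload, and trace path are illustrative.

import torch
from torch.profiler import ProfilerActivity, profile, schedule

def trace_handler(prof):
    # Export a Chrome trace at the end of each active window (path is illustrative).
    prof.export_chrome_trace("trace.json")

with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(wait=1, warmup=1, active=2),
    on_trace_ready=trace_handler,
    record_shapes=True,
    with_stack=True,  # record Python stacks for per-line attribution
) as prof:
    for _ in range(8):
        torch.mm(torch.randn(64, 64), torch.randn(64, 64))
        prof.step()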
