From 3cf19c08e1e43e07ffaf45bdb77d49f9ae86b0c4 Mon Sep 17 00:00:00 2001
From: Ammar Ahmad Awan
Date: Wed, 9 Aug 2023 17:36:50 -0500
Subject: [PATCH 1/3] Reorganize scripts into a model-family-first layout.

---
 .../training_scripts/{single_node => }/llama2/run_llama2_7b.sh      | 0
 .../{single_node => }/llama2/run_llama2_7b_lora.sh                  | 0
 .../training_scripts/{ => opt}/multi_node/run_66b.sh                | 0
 .../training_scripts/{ => opt}/single_gpu/run_1.3b.sh               | 0
 .../training_scripts/{ => opt}/single_gpu/run_6.7b_lora.sh          | 0
 .../{single_node/opt => opt/single_node}/run_1.3b.sh                | 0
 .../{single_node/opt => opt/single_node}/run_1.3b_lora.sh           | 0
 .../{single_node/opt => opt/single_node}/run_13b.sh                 | 0
 .../{single_node/opt => opt/single_node}/run_30b_lora.sh            | 0
 .../{single_node/opt => opt/single_node}/run_6.7b.sh                | 0
 .../training_scripts/{ => opt}/single_node/sweep/README.md          | 0
 .../training_scripts/{ => opt}/single_node/sweep/run_single.sh      | 0
 .../{ => opt}/single_node/sweep/run_step3_sweep.sh                  | 0
 13 files changed, 0 insertions(+), 0 deletions(-)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node => }/llama2/run_llama2_7b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node => }/llama2/run_llama2_7b_lora.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/multi_node/run_66b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/single_gpu/run_1.3b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/single_gpu/run_6.7b_lora.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node/opt => opt/single_node}/run_1.3b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node/opt => opt/single_node}/run_1.3b_lora.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node/opt => opt/single_node}/run_13b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node/opt => opt/single_node}/run_30b_lora.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{single_node/opt => opt/single_node}/run_6.7b.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/single_node/sweep/README.md (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/single_node/sweep/run_single.sh (100%)
 rename applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/{ => opt}/single_node/sweep/run_step3_sweep.sh (100%)

diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/llama2/run_llama2_7b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/llama2/run_llama2_7b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/llama2/run_llama2_7b_lora.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/multi_node/run_66b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/multi_node/run_66b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/multi_node/run_66b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/multi_node/run_66b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_1.3b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_1.3b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_1.3b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_1.3b_lora.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_13b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_13b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_13b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_13b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_30b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_30b_lora.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_30b_lora.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_30b_lora.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_6.7b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_6.7b.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_6.7b.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_6.7b.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_single.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh
similarity index 100%
rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh
rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh

From 8ed4067df67fa0af4f6e9b37b869baf8c4542570 Mon Sep 17 00:00:00 2001
From: Ammar Ahmad Awan
Date: Fri, 25 Aug 2023 14:38:50 -0500
Subject: [PATCH 2/3] Update calculations. Include critic model flops.
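
The step-3 throughput report now accounts for both models trained during
RLHF: training FLOPs for the actor and the critic are computed separately
with the Megatron-LM formula (through a shared calculate_flops() helper)
and summed, the activation-recomputation factor is lowered when LoRA is
enabled, and generation is additionally reported as a per-token latency and
the parameter-read bandwidth it implies.

Rough sketch of the intended arithmetic, using made-up shapes that are not
part of this patch (an OPT-1.3b-like actor and an OPT-350m-like critic):

    def flops(ckpt_factor, bs, seq, layers, hidden, vocab):
        # Megatron-LM estimate of training FLOPs for one iteration
        return (24 * ckpt_factor * bs * seq * layers * hidden**2) * (
            1.0 + seq / (6.0 * hidden) + vocab / (16.0 * layers * hidden))

    actor = flops(4, 32, 512, 24, 2048, 50272)   # gradient checkpointing on -> factor 4
    critic = flops(3, 32, 512, 24, 1024, 50272)  # gradient checkpointing off -> factor 3
    print(f"train TFLOPs/iter: {(actor + critic) / 1e12:.0f}")

    # Generation bandwidth as now reported: params * bytes per param / per-token latency
    pertok_lat = 12.0 / 256                      # generation time / generated tokens (illustrative)
    gen_bw = 1.3e9 * 2 / pertok_lat / 1e9        # fp16 actor -> GB/sec
    print(f"per-token latency: {pertok_lat * 1000:.1f} ms, BW: {gen_bw:.1f} GB/sec")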
---
 .../training/step3_rlhf_finetuning/main.py |   3 +-
 .../DeepSpeed-Chat/training/utils/perf.py  | 124 +++++++++++-------
 2 files changed, 79 insertions(+), 48 deletions(-)

diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
index 7b36e5e48..f0ca1fb22 100644
--- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
@@ -540,7 +540,8 @@ def main():
             print_rank_0(
                 f'Epoch: {epoch} | Step: {step} | PPO Epoch: {ppo_ep+1} | Actor Loss: {actor_loss_sum/inner_iter} | Critic Loss: {critic_loss_sum/inner_iter} | Unsupervised Loss: {unsup_loss_sum/inner_iter}',
                 args.global_rank)
-            print_throughput_step3(rlhf_engine.actor.model, args, e2e_time,
+            print_throughput_step3(rlhf_engine.actor.model,
+                                   rlhf_engine.critic, args, e2e_time,
                                    trainer.generate_time, training_time,
                                    args.global_rank)
             average_reward = get_all_reduce_mean(average_reward).item()
diff --git a/applications/DeepSpeed-Chat/training/utils/perf.py b/applications/DeepSpeed-Chat/training/utils/perf.py
index ff8ac4eb8..94b091cd0 100644
--- a/applications/DeepSpeed-Chat/training/utils/perf.py
+++ b/applications/DeepSpeed-Chat/training/utils/perf.py
@@ -10,14 +10,7 @@ def print_throughput(hf_model, args, e2e_time, rank=0):
     if rank <= 0:
         hf_config = hf_model.config
-        num_layers = getattr(hf_config, "num_hidden_layers",
-                             getattr(hf_config, "n_layer", None))
-        hidden_size = getattr(hf_config, "hidden_size",
-                              getattr(hf_config, "n_embd", None))
-        vocab_size = getattr(hf_config, "vocab_size", None)
-        assert all(
-            (num_layers, hidden_size, vocab_size)
-        ), "Could not determine number of layers, hidden size, and vocab size of the model"
+        num_layers, hidden_size, vocab_size = get_hf_configs(hf_config)

         gpus_per_model = torch.distributed.get_world_size()
         seq_length = args.max_seq_len
@@ -31,12 +24,9 @@ def print_throughput(hf_model, args, e2e_time, rank=0):
         params_in_billions = hf_model._num_params / (1e9)

         # Megatron paper's formula to calculate training flops
-        train_flops_per_iteration = (
-            24 * checkpoint_activations_factor * batch_size * seq_length *
-            num_layers *
-            (hidden_size**2)) * (1.0 + (seq_length / (6.0 * hidden_size)) +
-                                 (vocab_size /
-                                  (16.0 * num_layers * hidden_size)))
+        train_flops_per_iteration = calculate_flops(
+            checkpoint_activations_factor, batch_size, seq_length, num_layers,
+            hidden_size, vocab_size)

         train_tflops = train_flops_per_iteration / (e2e_time * gpus_per_model *
                                                     (10**12))
@@ -48,48 +38,58 @@


 # Enhanced version of the function above that provides calculations and printing for Step 3
-def print_throughput_step3(hf_model,
+def print_throughput_step3(actor_model,
+                           critic_model,
                            args,
                            e2e_time,
                            gen_exp_time,
                            train_time,
                            rank=0):
     if rank <= 0:
-        hf_config = hf_model.config
-        num_layers = getattr(hf_config, "num_hidden_layers",
-                             getattr(hf_config, "n_layer", None))
-        hidden_size = getattr(hf_config, "hidden_size",
-                              getattr(hf_config, "n_embd", None))
-        vocab_size = getattr(hf_config, "vocab_size", None)
-        assert all(
-            (num_layers, hidden_size, vocab_size)
-        ), "Could not determine number of layers, hidden size, and vocab size of the model"
+        hf_config = actor_model.config
+        num_layers, hidden_size, vocab_size = get_hf_configs(hf_config)

         gpus_per_model = torch.distributed.get_world_size()
         seq_length = args.max_answer_seq_len + args.max_prompt_seq_len
         batch_size = args.per_device_generation_batch_size * args.generation_batches * args.ppo_epochs * gpus_per_model * 1 if args.unsupervised_dataset_name is None else 2
         samples_per_second = batch_size / e2e_time
-        checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3
-        hf_model._num_params = sum([
+
+        actor_checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3
+        critic_checkpoint_activations_factor = 4 if args.critic_gradient_checkpointing else 3
+        if args.actor_lora_dim > 0:
+            k = args.actor_lora_dim * 2 / hidden_size
+            actor_checkpoint_activations_factor -= (1 - k)
+        if args.critic_lora_dim > 0:
+            k = args.critic_lora_dim * 2 / hidden_size
+            critic_checkpoint_activations_factor -= (1 - k)
+
+        actor_model._num_params = sum([
             p.ds_numel if hasattr(p, "ds_tensor") else p.numel()
-            for p in hf_model.parameters()
+            for p in actor_model.parameters()
         ])
-        params_in_billions = hf_model._num_params / (1e9)
+        actor_params_in_billions = actor_model._num_params / (1e9)
+
+        critic_model._num_params = sum([
+            p.ds_numel if hasattr(p, "ds_tensor") else p.numel()
+            for p in critic_model.parameters()
+        ])
+        critic_params_in_billions = critic_model._num_params / (1e9)

         # Megatron paper's formula to calculate training flops
-        train_flops_per_iteration = (
-            24 * checkpoint_activations_factor * batch_size * seq_length *
-            num_layers *
-            (hidden_size**2)) * (1.0 + (seq_length / (6.0 * hidden_size)) +
-                                 (vocab_size /
-                                  (16.0 * num_layers * hidden_size)))
+        actor_train_flops_per_iteration = calculate_flops(
+            actor_checkpoint_activations_factor, batch_size, seq_length,
+            num_layers, hidden_size, vocab_size)
+        critic_train_flops_per_iteration = calculate_flops(
+            critic_checkpoint_activations_factor, batch_size, seq_length,
+            num_layers, hidden_size, vocab_size)

-        train_tflops = train_flops_per_iteration / (train_time *
-                                                    gpus_per_model * (10**12))
+        total_train_flops = actor_train_flops_per_iteration + critic_train_flops_per_iteration
+        train_tflops = total_train_flops / (train_time * gpus_per_model *
                                             (10**12))

         gen_bs = args.per_device_generation_batch_size * gpus_per_model

-        # Modified formula for calculating flops in forward pass only
+        # Modified formula for calculating flops in the forward pass only
         gen_flops_per_iteration = (
             24 * gen_bs * seq_length * num_layers *
             (hidden_size**2)) * (1.0 + (seq_length / (6.0 * hidden_size)) +
@@ -99,28 +99,58 @@ def print_throughput_step3(hf_model,
                                  (vocab_size /
                                   (16.0 * num_layers * hidden_size)))

         gen_tflops = gen_flops_per_iteration / (gen_exp_time * gpus_per_model *
                                                 (10**12))

-        if hf_config.torch_dtype == "float16":
+        if hf_config.torch_dtype == torch.float16:
             num_bytes = 2
-        elif hf_config.torch_dtype == "float32":
+        elif hf_config.torch_dtype == torch.float32:
             num_bytes = 4
         else:
-            num_bytes = 1
+            num_bytes = -1

-        gen_bw = (hf_model._num_params *
-                  (num_bytes / 1e9)) / gen_exp_time * args.max_answer_seq_len
+        print(
+            f"{num_bytes=}, {hf_config.torch_dtype=}, {actor_model._num_params=}"
+        )
+        pertok_lat = gen_exp_time / args.max_answer_seq_len
+        gen_bw = 1 / pertok_lat * actor_model._num_params * num_bytes / 1e9

-        total_flops_per_iteration = train_flops_per_iteration + gen_flops_per_iteration * args.generation_batches
+        total_flops_per_iteration = total_train_flops + gen_flops_per_iteration * args.generation_batches
         total_tflops = total_flops_per_iteration / (e2e_time * gpus_per_model *
                                                     (10**12))

         print(
-            f"End-to-End => Latency: {e2e_time:.2f}s, TFLOPs: {total_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Sequence Length: {seq_length}"
+            f"End-to-End => Latency: {e2e_time:.2f}s, TFLOPs: {total_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Total Seq. Length: {seq_length}"
         )
         print(
-            f"Generation => Latency: {gen_exp_time:.2f}s, TFLOPs: {gen_tflops:.2f}, BW: {gen_bw:.2f} GB/sec"
+            f"Generation => Latency: {gen_exp_time:.2f}s, Per-token Latency {pertok_lat*1000:.2f} ms, TFLOPs: {gen_tflops:.2f}, BW: {gen_bw if num_bytes > 0 else num_bytes:.2f} GB/sec, Answer Seq. Length: {args.max_answer_seq_len}"
         )
         print(
             f"Training => Latency: {train_time:.2f}s, TFLOPs: {train_tflops:.2f}"
         )
-        param_string = f"{params_in_billions:.3f} B" if params_in_billions != 0 else "NA"
-        print(f"Parameters => {param_string}")
+        actor_param_string = f"{actor_params_in_billions:.3f} B" if actor_params_in_billions != 0 else "NA"
+        critic_param_string = f"{critic_params_in_billions:.3f} B" if critic_params_in_billions != 0 else "NA"
+        print(
+            f"Actor Model Parameters => {actor_param_string}, Critic Model Parameters => {critic_param_string}"
+        )
+
+
+# Helper function to calculate FLOPs using the Megatron-LM paper's formula
+def calculate_flops(checkpoint_activations_factor, batch_size, seq_length,
+                    num_layers, hidden_size, vocab_size):
+    flops_per_iteration = (24 * checkpoint_activations_factor * batch_size *
+                           seq_length * num_layers * (hidden_size**2)) * (
+                               1.0 + (seq_length / (6.0 * hidden_size)) +
+                               (vocab_size /
+                                (16.0 * num_layers * hidden_size)))
+    return flops_per_iteration
+
+
+def get_hf_configs(hf_config):
+    num_layers = getattr(hf_config, "num_hidden_layers",
+                         getattr(hf_config, "n_layer", None))
+    hidden_size = getattr(hf_config, "hidden_size",
+                          getattr(hf_config, "n_embd", None))
+    vocab_size = getattr(hf_config, "vocab_size", None)
+    assert all(
+        (num_layers, hidden_size, vocab_size)
+    ), "Could not determine number of layers, hidden size, and vocab size of the model"
+
+    return num_layers, hidden_size, vocab_size

From 672a97a833ca3a9b1b746245138afbd4bafd7628 Mon Sep 17 00:00:00 2001
From: Ammar Ahmad Awan
Date: Fri, 25 Aug 2023 22:41:03 +0000
Subject: [PATCH 3/3] fix calcs.
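
The actor and the critic are typically different model families and sizes,
so they can no longer share a single set of num_layers / hidden_size /
vocab_size values. Each config is now read separately through
get_hf_configs(), calculate_flops() takes the HF config object directly,
the LoRA correction is also applied in the step-1/2 print_throughput()
path, and the leftover debug print is removed.

Minimal sketch of the resulting call shape (model names and batch/sequence
numbers are illustrative only; it assumes utils.perf is importable, e.g.
when running from the training/ directory):

    from transformers import AutoConfig
    from utils.perf import calculate_flops, get_hf_configs

    actor_cfg = AutoConfig.from_pretrained("facebook/opt-1.3b")
    critic_cfg = AutoConfig.from_pretrained("facebook/opt-350m")

    # get_hf_configs() falls back to n_layer/n_embd for GPT-2 style configs.
    layers, hidden, vocab = get_hf_configs(actor_cfg)
    print(layers, hidden, vocab)  # 24 2048 50272 for opt-1.3b

    # calculate_flops() now pulls num_layers/hidden_size/vocab_size from the config.
    total = calculate_flops(4, 32, 512, actor_cfg) + \
            calculate_flops(3, 32, 512, critic_cfg)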
---
 .../DeepSpeed-Chat/training/utils/perf.py | 46 +++++++++++--------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/applications/DeepSpeed-Chat/training/utils/perf.py b/applications/DeepSpeed-Chat/training/utils/perf.py
index 94b091cd0..df57a2046 100644
--- a/applications/DeepSpeed-Chat/training/utils/perf.py
+++ b/applications/DeepSpeed-Chat/training/utils/perf.py
@@ -17,6 +17,10 @@ def print_throughput(hf_model, args, e2e_time, rank=0):
         batch_size = args.per_device_train_batch_size
         samples_per_second = batch_size / e2e_time
         checkpoint_activations_factor = 4 if args.gradient_checkpointing else 3
+        if args.lora_dim > 0:
+            k = args.lora_dim * 2 / hidden_size
+            checkpoint_activations_factor -= (1 - k)
+
         hf_model._num_params = sum([
             p.ds_numel if hasattr(p, "ds_tensor") else p.numel()
             for p in hf_model.parameters()
@@ -25,8 +29,7 @@ def print_throughput(hf_model, args, e2e_time, rank=0):
         params_in_billions = hf_model._num_params / (1e9)

         # Megatron paper's formula to calculate training flops
         train_flops_per_iteration = calculate_flops(
-            checkpoint_activations_factor, batch_size, seq_length, num_layers,
-            hidden_size, vocab_size)
+            checkpoint_activations_factor, batch_size, seq_length, hf_config)

         train_tflops = train_flops_per_iteration / (e2e_time * gpus_per_model *
                                                     (10**12))
@@ -46,8 +49,15 @@ def print_throughput_step3(actor_model,
                            train_time,
                            rank=0):
     if rank <= 0:
-        hf_config = actor_model.config
-        num_layers, hidden_size, vocab_size = get_hf_configs(hf_config)
+        # Actor model passed here is a HF model.
+        actor_hf_config = actor_model.config
+        # Critic model passed here is a DeepSpeed Engine. The module inside is the Reward model (that wraps a HF model).
+        critic_hf_config = critic_model.module.config
+
+        actor_num_layers, actor_hidden_size, actor_vocab_size = get_hf_configs(
+            actor_hf_config)
+        critic_num_layers, critic_hidden_size, critic_vocab_size = get_hf_configs(
+            critic_hf_config)

         gpus_per_model = torch.distributed.get_world_size()
         seq_length = args.max_answer_seq_len + args.max_prompt_seq_len
@@ -57,10 +67,10 @@ def print_throughput_step3(actor_model,
         actor_checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3
         critic_checkpoint_activations_factor = 4 if args.critic_gradient_checkpointing else 3
         if args.actor_lora_dim > 0:
-            k = args.actor_lora_dim * 2 / hidden_size
+            k = args.actor_lora_dim * 2 / actor_hidden_size
             actor_checkpoint_activations_factor -= (1 - k)
         if args.critic_lora_dim > 0:
-            k = args.critic_lora_dim * 2 / hidden_size
+            k = args.critic_lora_dim * 2 / critic_hidden_size
             critic_checkpoint_activations_factor -= (1 - k)

         actor_model._num_params = sum([
@@ -76,12 +86,13 @@ def print_throughput_step3(actor_model,
         critic_params_in_billions = critic_model._num_params / (1e9)

         # Megatron paper's formula to calculate training flops
+
         actor_train_flops_per_iteration = calculate_flops(
             actor_checkpoint_activations_factor, batch_size, seq_length,
-            num_layers, hidden_size, vocab_size)
+            actor_hf_config)
         critic_train_flops_per_iteration = calculate_flops(
             critic_checkpoint_activations_factor, batch_size, seq_length,
-            num_layers, hidden_size, vocab_size)
+            critic_hf_config)

         total_train_flops = actor_train_flops_per_iteration + critic_train_flops_per_iteration
         train_tflops = total_train_flops / (train_time * gpus_per_model *
                                             (10**12))
@@ -91,24 +102,22 @@ def print_throughput_step3(actor_model,

         # Modified formula for calculating flops in the forward pass only
         gen_flops_per_iteration = (
-            24 * gen_bs * seq_length * num_layers *
-            (hidden_size**2)) * (1.0 + (seq_length / (6.0 * hidden_size)) +
-                                 (vocab_size /
-                                  (16.0 * num_layers * hidden_size)))
+            24 * gen_bs * seq_length * actor_num_layers *
+            (actor_hidden_size**2)) * (
+                1.0 + (seq_length / (6.0 * actor_hidden_size)) +
+                (actor_vocab_size /
+                 (16.0 * actor_num_layers * actor_hidden_size)))

         gen_tflops = gen_flops_per_iteration / (gen_exp_time * gpus_per_model *
                                                 (10**12))

-        if hf_config.torch_dtype == torch.float16:
+        if actor_hf_config.torch_dtype == torch.float16:
             num_bytes = 2
-        elif hf_config.torch_dtype == torch.float32:
+        elif actor_hf_config.torch_dtype == torch.float32:
             num_bytes = 4
         else:
             num_bytes = -1

-        print(
-            f"{num_bytes=}, {hf_config.torch_dtype=}, {actor_model._num_params=}"
-        )
         pertok_lat = gen_exp_time / args.max_answer_seq_len
         gen_bw = 1 / pertok_lat * actor_model._num_params * num_bytes / 1e9

@@ -134,7 +143,8 @@ def print_throughput_step3(actor_model,

 # Helper function to calculate FLOPs using the Megatron-LM paper's formula
 def calculate_flops(checkpoint_activations_factor, batch_size, seq_length,
-                    num_layers, hidden_size, vocab_size):
+                    hf_config):
+    num_layers, hidden_size, vocab_size = get_hf_configs(hf_config)
     flops_per_iteration = (24 * checkpoint_activations_factor * batch_size *
                            seq_length * num_layers * (hidden_size**2)) * (
                                1.0 + (seq_length / (6.0 * hidden_size)) +