 import torch_tensorrt
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from contextlib import nullcontext
-from utils import export_llm, generate, recordStats, time_generate, generate_with_kv_cache, get_zeroed_kv_cache_inputs
+from utils import export_llm, generate, recordStats, time_generate, generate_with_kv_cache
+import sys
+import os

+# Register SDPA as a standalone operator. Converter and lowering pass are defined in register_sdpa.py
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from register_sdpa import *

 DEVICE = torch.device("cuda:0")

 def get_model(args):
     with torch.no_grad():
-        if args.model == "meta-llama/Llama-2-7b-chat-hf":
-            model = (
-                AutoModelForCausalLM.from_pretrained(
-                    args.model,
-                    use_cache=False,
-                    attn_implementation="sdpa",
-                    num_hidden_layers=1
-                )
-                .eval()
-                .cuda()
-            )
-        elif args.model == "meta-llama/Llama-3.2-1B-Instruct":
-            model = (
-                AutoModelForCausalLM.from_pretrained(
-                    args.model,
-                    use_cache=False,
-                    attn_implementation="sdpa",
-                    num_hidden_layers=1
-                )
-                .eval()
-                .cuda()
-            )
-
-        elif args.model == "meta-llama/Llama-3.2-3B-Instruct":
-            model = (
+        # Supported list of models:
+        # - meta-llama/Llama-3.2-1B-Instruct
+        # - meta-llama/Llama-3.2-3B-Instruct
+        # - meta-llama/Llama-3.1-8B-Instruct
+        # - Qwen/Qwen2.5-1.5B-Instruct
+        model = (
             AutoModelForCausalLM.from_pretrained(
                 args.model,
                 use_cache=False,
                 attn_implementation="sdpa",
-                # num_hidden_layers=2
-            )
-            .eval()
-            .cuda()
-        )
-        elif args.model == "meta-llama/Llama-3.1-8B-Instruct":
-            model = (
-                AutoModelForCausalLM.from_pretrained(
-                    args.model,
-                    use_cache=False,
-                    attn_implementation="sdpa",  # num_hidden_layers=1
-                )
-                .eval()
-                .cuda()
-            )
-        elif args.model == "google/gemma-3-1b-it":
-            model = (
-                AutoModelForCausalLM.from_pretrained(
-                    "google/gemma-3-1b-it",
-                    use_cache=False,
-                    attn_implementation="sdpa"
+                # num_hidden_layers=1
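+                # (uncomment num_hidden_layers above to load a truncated model, e.g. for quicker debugging)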
             )
             .eval()
             .cuda()
@@ -91,9 +57,9 @@ def get_model(args):


 def compile_torchtrt(model, input_ids, args):
-    max_seq_len = input_ids.shape[1] + args.max_tokens
+    max_seq_len = input_ids.shape[1] + args.num_tokens
     ep = export_llm(model, input_ids, max_seq_len=max_seq_len)
-
+
     # Set precision specific flags
     use_fp32_acc = False
     use_explicit_typing = False
@@ -119,6 +85,7 @@ def compile_torchtrt(model, input_ids, args):
         disable_tf32=True,
         use_python_runtime=True,
         debug=args.debug,
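+        # Offload the original PyTorch module to CPU during compilation to reduce peak GPU memory usage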
+        offload_module_to_cpu=True,
         min_block_size=args.min_block_size,
     )

@@ -170,23 +137,29 @@ def measure_perf(trt_model, input_signature, backend_name):
170137 "--model" , type = str , default = "meta-llama/Llama-3.2-1B-Instruct" , help = "Name of LLM model"
171138 )
172139 arg_parser .add_argument (
173- "--tokenizer_path " ,
140+ "--tokenizer " ,
174141 type = str ,
175- default = "meta-llama/Llama-3.2-1B-Instruct " ,
142+ default = "" ,
176143 help = "Name of LLM model tokenizer" ,
177144 )
178145 arg_parser .add_argument (
179146 "--prompt" , type = str , default = "What is parallel programming ?" , help = "Prompt"
180147 )
181- arg_parser .add_argument ("--precision" , type = str , default = "FP16" , help = "Prompt " )
148+ arg_parser .add_argument ("--precision" , type = str , default = "FP16" , help = "Precision to use in the model. Options: FP16, BF16, FP32 " )
182149 arg_parser .add_argument (
183150 "--iterations" , type = int , default = 5 , help = "no. of iterations to run"
184151 )
185152 arg_parser .add_argument (
186153 "--min_block_size" , type = int , default = 1 , help = "no. of iterations to run"
     )
     arg_parser.add_argument(
-        "--max_tokens", type=int, default=128, help="no. of max tokens to be generated"
+        "--num_tokens", type=int, default=128, help="no. of output tokens to be generated"
+    )
+    arg_parser.add_argument(
+        "--batch_size", type=int, default=1, help="Batch size used for benchmarking"
+    )
+    arg_parser.add_argument(
+        "--isl", type=int, default=2048, help="Input sequence length used for benchmarking"
     )
     arg_parser.add_argument(
         "--enable_pytorch_run",
@@ -196,8 +169,8 @@ def measure_perf(trt_model, input_signature, backend_name):
     arg_parser.add_argument(
         "--cache",
         type=str,
-        default="static",
-        help="Type of KV cache to use",
+        default="",
+        help="Type of KV cache to use. Options: static_v1, static_v2, dynamic",
     )
     arg_parser.add_argument(
         "--cudagraph",
@@ -214,22 +187,24 @@ def measure_perf(trt_model, input_signature, backend_name):
         action="store_true",
         help="Enable benchmark (default: False)"
     )
+
     args = arg_parser.parse_args()
     with torch.inference_mode():
         model = get_model(args)

-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
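+        # Fall back to the model name when no separate tokenizer is specified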
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer or args.model)

-        prompt = "What is parallel programming ?"
-        # prompt = "What is the capital of France ?"
-        model_inputs = tokenizer(prompt, return_tensors="pt")
-        input_ids = model_inputs["input_ids"].to(DEVICE)
-        # Prepare input prompt
-        # word = "What"
-        # word_ids = tokenizer(word, return_tensors="pt").input_ids[0]  # Get the first (and only) sequence
-        # input_ids = word_ids.repeat(1024).unsqueeze(0).to(model.device)  # Add batch dimension and move to device
+        # Prepare input for benchmarking or evaluation
+        if args.benchmark:
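+            # Synthetic prompt: random token ids of shape (batch_size, isl) are sufficient for timing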
+            input_ids = torch.randint(1, 10000, (args.batch_size, args.isl), dtype=torch.int64).to(model.device)
+            position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).to(DEVICE)
+        else:
+            model_inputs = tokenizer(args.prompt, return_tensors="pt")
+            input_ids = model_inputs["input_ids"].to(DEVICE)
+            position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).to(DEVICE)
+

-        MAX_OUTPUT_SEQ_LENGTH = input_ids.shape[1] + args.max_tokens
+        MAX_OUTPUT_SEQ_LENGTH = input_ids.shape[1] + args.num_tokens
         # Pyt
         pyt_gen_tokens = None
         pyt_timings = None
@@ -238,7 +213,6 @@ def measure_perf(trt_model, input_signature, backend_name):
             pyt_gen_tokens = generate(
                 model, input_ids.clone(), MAX_OUTPUT_SEQ_LENGTH, tokenizer.eos_token_id
             )
-
             if args.benchmark:
                 pyt_timings = time_generate(
                     generate,
@@ -249,71 +223,22 @@ def measure_perf(trt_model, input_signature, backend_name):
                     iterations=args.iterations,
                 )
                 pyt_stats = recordStats(
-                    "PyTorch", pyt_timings, args.precision, batch_size=1, compile_time_s=None
+                    "PyTorch", pyt_timings, args.precision, batch_size=args.batch_size, compile_time_s=None
                 )

-        # TRT
-        pyt_logits_tok1 = model.cuda()(input_ids)
-        next_tokens = torch.argmax(pyt_logits_tok1.logits[:, -1, :], dim=-1)
-        input_seq = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-        pyt_logits_tok2 = model.cuda()(input_seq)
-        from lower_sdpa import *
-        if args.cache == "static":
-            # This import is required to register static KV cache transformations as lowering passes
-            from static_cache2 import *
-            trt_model = compile_torchtrt(model, input_ids, args)
-            kv_cache = get_zeroed_kv_cache_inputs(trt_model)
-
-            # First token generation
-            pyt_keys = torch.load("key.pt"); pyt_values = torch.load("value.pt")
-            trt_logits, key_cache, value_cache, trt_keys_1, trt_values_1 = trt_model(input_ids.clone(), True, *kv_cache, 0, input_ids.shape[1])
-            print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits_tok1.logits - trt_logits))}")
-            print(f"Diff between pyt and trt keys: {torch.mean(torch.abs(pyt_keys - trt_keys_1))}")
-            print(f"Diff between pyt and trt keys in cache: {torch.mean(torch.abs(pyt_keys - key_cache[:, :, :-2, :]))}")
-            print(f"Diff between pyt and trt values: {torch.mean(torch.abs(pyt_values - trt_values_1))}")
-            print(f"Diff between pyt and trt values in cache: {torch.mean(torch.abs(pyt_values - value_cache[:, :, :-2, :]))}")
-            next_tokens = torch.argmax(trt_logits[:, -1, :], dim=-1)
-
-            # Second token generation
-            trt_logits_2, key_cache2, value_cache2, trt_keys_2, trt_values_2 = trt_model(next_tokens[:, None], False, key_cache.clone(), value_cache.clone(), input_ids.shape[1], input_ids.shape[1] + 1)
-            pyt_keys2 = torch.load("key2.pt"); pyt_values2 = torch.load("value2.pt")
-            print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits_tok2.logits[:, -1:, :] - trt_logits_2))}")
-            print(f"Diff between pyt and trt keys: {torch.mean(torch.abs(pyt_keys2[:, :, -2:-1, :] - trt_keys_2))}")
-            print(f"Diff between pyt and trt keys in cache: {torch.mean(torch.abs(pyt_keys2 - key_cache2[:, :, :-1, :]))}")
-            print(f"Diff between pyt and trt values: {torch.mean(torch.abs(pyt_values2[:, :, -2:-1, :] - trt_values_2))}")
-            print(f"Diff between pyt and trt values in cache: {torch.mean(torch.abs(pyt_values2 - value_cache2[:, :, :-1, :]))}")
-            breakpoint()
+        if args.cache == "static_v1":
+            # This import is required to register static v1 KV cache transformations as lowering passes
+            import static_cache_v1
+        if args.cache == "static_v2":
+            # This import is required to register static v2 KV cache transformations as lowering passes
+            import static_cache_v2
         elif args.cache == "dynamic":
-            from dynamic_cache import *
-            trt_model = compile_torchtrt(model, input_ids, args)
-            breakpoint()
-            kv_cache = get_zeroed_kv_cache_inputs(trt_model)
-        else:
-            # pyt_logits = model.cuda()(input_ids.clone())
-            trt_model = compile_torchtrt(model, input_ids, args)
-            # trt_logits = trt_model(input_ids.clone(), True)
-            # print(f"Diff between pyt and trt: {torch.mean(torch.abs(pyt_logits - trt_logits))}")
-            # print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits.logits - trt_logits.logits))}")
-        if args.cache == "static":
-            if args.cudagraph:
-                # Run a decoding loop with prefill and generate phases so that the CUDAGraph is recorded for both of these phases.
-                # trt_input_signature = (input_ids.clone(),) + get_zeroed_kv_cache_inputs(trt_model)
-                torch_tensorrt.runtime.set_cudagraphs_mode(True)
-
-            trt_gen_tokens = generate_with_kv_cache(
-                trt_model, input_ids.clone(), MAX_OUTPUT_SEQ_LENGTH, tokenizer.eos_token_id,
-            )
+            import dynamic_cache

-            if args.benchmark:
-                trt_timings = time_generate(
-                    generate_with_kv_cache,
-                    trt_model,
-                    input_ids.clone(),
-                    MAX_OUTPUT_SEQ_LENGTH,
-                    tokenizer.eos_token_id,
-                    iterations=args.iterations,
-                )
-        elif args.cache == "dynamic":
+
+        trt_model = compile_torchtrt(model, input_ids, args)
+
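+        # When a KV cache variant is selected, generation uses the KV-cache decode path; otherwise the plain generate loop below is used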
+        if args.cache == "static_v1" or args.cache == "static_v2" or args.cache == "dynamic":
             if args.cudagraph:
                 # Run a decoding loop with prefill and generate phases so that the CUDAGraph is recorded for both of these phases.
                 # trt_input_signature = (input_ids.clone(),) + get_zeroed_kv_cache_inputs(trt_model)
@@ -332,7 +257,6 @@ def measure_perf(trt_model, input_signature, backend_name):
                     tokenizer.eos_token_id,
                     iterations=args.iterations,
                 )
-
         else:
             trt_gen_tokens = generate(
                 trt_model, input_ids.clone(), MAX_OUTPUT_SEQ_LENGTH, tokenizer.eos_token_id,
@@ -349,14 +273,20 @@ def measure_perf(trt_model, input_signature, backend_name):

         if args.benchmark:
             trt_stats = recordStats(
-                "TensorRT", trt_timings, args.precision, batch_size=1, compile_time_s=None
+                "TensorRT", trt_timings, args.precision, batch_size=args.batch_size, compile_time_s=None
             )

-        if args.enable_pytorch_run:
-            print_outputs("PyTorch", pyt_gen_tokens, tokenizer)
-        print_outputs("TensorRT", trt_gen_tokens, tokenizer)
+
+        if not args.benchmark:
+            if args.enable_pytorch_run:
+                print_outputs("PyTorch", pyt_gen_tokens, tokenizer)
+
+            print_outputs("TensorRT", trt_gen_tokens, tokenizer)

-        if args.benchmark:
+            if args.enable_pytorch_run:
+                print(f"PyTorch and TensorRT outputs match: {torch.equal(pyt_gen_tokens, trt_gen_tokens)}")
+
+        if args.benchmark:
             if args.enable_pytorch_run:
                 print("=========PyTorch PERFORMANCE============\n")
                 print(pyt_stats)