1 change: 0 additions & 1 deletion docs/static/deprecated-llama-stack-spec.html
@@ -4238,7 +4238,6 @@
             },
             "max_tokens": {
                 "type": "integer",
-                "default": 0,
                 "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/deprecated-llama-stack-spec.yaml
@@ -3087,7 +3087,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
1 change: 0 additions & 1 deletion docs/static/experimental-llama-stack-spec.html
@@ -2713,7 +2713,6 @@
             },
             "max_tokens": {
                 "type": "integer",
-                "default": 0,
                 "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/experimental-llama-stack-spec.yaml
@@ -1927,7 +1927,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
1 change: 0 additions & 1 deletion docs/static/stainless-llama-stack-spec.html
@@ -15472,7 +15472,6 @@
             },
             "max_tokens": {
                 "type": "integer",
-                "default": 0,
                 "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
1 change: 0 additions & 1 deletion docs/static/stainless-llama-stack-spec.yaml
@@ -11541,7 +11541,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
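
The six hunks above are regenerated artifacts; the `default: 0` entry disappears because the Pydantic field changed below no longer carries one. A minimal sketch of the schema-level difference, using plain Pydantic v2 rather than the project's actual spec generator (which, judging by these hunks, also omits null defaults from the published OpenAPI):

from pydantic import BaseModel


class Before(BaseModel):
    max_tokens: int | None = 0


class After(BaseModel):
    max_tokens: int | None = None


# The old field advertises a concrete default that clients may materialize:
print(Before.model_json_schema()["properties"]["max_tokens"])
# {'anyOf': [{'type': 'integer'}, {'type': 'null'}], 'default': 0, 'title': 'Max Tokens'}

# Plain Pydantic still emits 'default': None here; the specs above simply
# drop the key for a null default, so no default is published at all.
print(After.model_json_schema()["properties"]["max_tokens"])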
2 changes: 1 addition & 1 deletion llama_stack/apis/inference/inference.py
@@ -96,7 +96,7 @@ class SamplingParams(BaseModel):
 
     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
 
-    max_tokens: int | None = 0
+    max_tokens: int | None = None
     repetition_penalty: float | None = 1.0
     stop: list[str] | None = None
 
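
This one-line default change is the root fix: `0` is a real value that flows downstream, and an OpenAI-compatible backend can read `max_tokens: 0` as a request to generate zero tokens, whereas `None` means unset and can be dropped before the request leaves the stack. A minimal sketch of that distinction, assuming Pydantic v2; `build_provider_request` is a hypothetical helper, not code from this PR:

from pydantic import BaseModel


class SamplingParams(BaseModel):
    # Field changed in this PR: None now means "let the provider decide".
    max_tokens: int | None = None
    repetition_penalty: float | None = 1.0
    stop: list[str] | None = None


def build_provider_request(params: SamplingParams) -> dict:
    # Hypothetical helper: omit unset fields so a spurious 0 can never
    # reach a provider and be interpreted as "produce zero tokens".
    return params.model_dump(exclude_none=True)


print(build_provider_request(SamplingParams()))
# {'repetition_penalty': 1.0}  (max_tokens omitted; provider default applies)
print(build_provider_request(SamplingParams(max_tokens=512)))
# {'max_tokens': 512, 'repetition_penalty': 1.0}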
2 changes: 2 additions & 0 deletions tests/integration/eval/test_eval.py
@@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },
@@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },
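
With no concrete default left to fall back on, the eval tests now pin an explicit token budget so generation stays bounded whatever the provider's own default is. A sketch of the candidate config these two hunks build, with a placeholder model id; only the sampling_params fields are confirmed by the diff:

# Placeholder; the real tests receive this via the text_model_id fixture.
text_model_id = "meta-llama/Llama-3.1-8B-Instruct"

eval_candidate = {
    "type": "model",  # assumed wrapper shape, inferred from the hunk context
    "model": text_model_id,
    "sampling_params": {
        "temperature": 0.0,
        # Added by this PR: an explicit cap now that max_tokens defaults to None.
        "max_tokens": 512,
    },
}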