diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 704bc6b7324d..21982b01b9cc 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
     vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
     && python3 -m pip install pytest pytest-asyncio tpu-info \
     && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_XLA_CACHE_PATH= \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
     && echo HARDWARE \
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index fc8e271f7f91..1e4a80539972 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -13,6 +13,7 @@

 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
@@ -63,10 +64,13 @@ def test_structured_output(
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")

+    # Don't use eager execution on TPUs because we want to test for no
+    # recompilation at runtime
+    enforce_eager = bool(not current_platform.is_tpu())
     # Use a single LLM instance for several scenarios to
     # speed up the test suite.
     llm = LLM(model=model_name,
-              enforce_eager=True,
+              enforce_eager=enforce_eager,
               max_model_len=1024,
               guided_decoding_backend=guided_decoding_backend,
               tokenizer_mode=tokenizer_mode)
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py
index 046d3e404e4f..c6b492b5a3cc 100644
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -23,7 +23,7 @@ def test_sampler_different(model_name: str):
     different results.
     """
     llm = LLM(model_name,
-              enforce_eager=True,
+              enforce_eager=False,
               max_num_seqs=1,
               max_model_len=512,
               max_num_batched_tokens=512)
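
For context, a minimal sketch of reproducing the CI environment above on a TPU host with vLLM and the test dependencies already installed. The empty VLLM_XLA_CACHE_PATH mirrors the buildkite change and is assumed to disable the persistent XLA compilation cache, so that VLLM_XLA_CHECK_RECOMPILATION=1 can surface any compilation that happens after warm-up; the pytest invocation is illustrative, not taken from the CI script.

    # Assumed local equivalent of the CI setup; only the three env vars come from the diff.
    export VLLM_XLA_CACHE_PATH=             # assumption: empty value disables the on-disk XLA cache
    export VLLM_USE_V1=1                    # run the V1 engine, as in the CI script
    export VLLM_XLA_CHECK_RECOMPILATION=1   # assumed to raise an error on recompilation after warm-up
    pytest -s -v tests/v1/tpu/test_sampler.py   # example: run one of the TPU tests touched above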