diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index 35df6011550f..7a57f29a07fa 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -23,10 +23,14 @@
 
 # Use `distributed_executor_backend="external_launcher"` so that
 # this llm engine/instance only creates one worker.
+# it is important to set an explicit seed to make sure that
+# all ranks have the same random seed, so that sampling can be
+# deterministic across ranks.
 llm = LLM(
     model="facebook/opt-125m",
     tensor_parallel_size=2,
     distributed_executor_backend="external_launcher",
+    seed=0,
 )
 
 outputs = llm.generate(prompts, sampling_params)
diff --git a/vllm/config.py b/vllm/config.py
index 1255d716a2e4..69cde4e362c8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -761,6 +761,12 @@ def verify_with_parallel_config(
         self,
         parallel_config: "ParallelConfig",
     ) -> None:
+
+        if parallel_config.distributed_executor_backend == "external_launcher":
+            assert self.seed is not None, (
+                "Seed must be set when using external launcher backend to "
+                "make sure sampling results are the same across workers.")
+
         total_num_attention_heads = getattr(self.hf_text_config,
                                             "num_attention_heads", 0)
         tensor_parallel_size = parallel_config.tensor_parallel_size
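
For context, a minimal sketch of how the patched example is expected to be run; it is not part of the patch. It assumes vLLM is installed and two GPUs are visible, and the prompts, sampling parameters, and launch command are illustrative placeholders rather than values taken from the repository.

```python
# Minimal sketch (not part of the patch). Launched via torchrun so that each
# rank starts its own process, e.g.:
#   torchrun --nproc-per-node=2 torchrun_example.py
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# With this patch, omitting `seed` while using the external_launcher backend
# trips the new assertion in verify_with_parallel_config, since every rank
# must sample with an identical RNG state to stay in lockstep.
llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    distributed_executor_backend="external_launcher",
    seed=0,  # same seed on every rank -> identical sampling results
)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```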