diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0000b09bfaa3..2ca4f539b7ef 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -131,7 +131,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
-        torch.cuda.synchronize()
 
         free_gpu_memory, _ = torch.cuda.mem_get_info()
         # NOTE(woosuk): Here we assume that the other processes using the same
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index f51b51d433d3..9977ce2c7224 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -200,7 +200,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                               weights_memory_in_bytes=self.model_runner.
                               model_memory_usage) as result:
             self.model_runner.profile_run()
-            torch.cuda.synchronize()
 
         self._assert_memory_footprint_increased_during_profiling()
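
Both hunks drop a `torch.cuda.synchronize()` issued between `model_runner.profile_run()` and the subsequent free-memory query. A minimal sketch of the measurement pattern these call sites share, assuming only standard PyTorch CUDA APIs; the `estimate_profile_usage` helper and the `dummy_forward` callback are hypothetical stand-ins, not vLLM code:

```python
import torch

def estimate_profile_usage(dummy_forward) -> int:
    """Return bytes of device memory consumed by a dummy profiling pass."""
    free_before, _total = torch.cuda.mem_get_info()

    # Run the dummy workload (in vLLM this is profile_run()).
    dummy_forward()

    # No explicit torch.cuda.synchronize() before the query: the caching
    # allocator reserves device memory host-side at allocation time, so
    # cudaMemGetInfo already reflects the profiling run's reservations
    # even if kernels launched by dummy_forward are still in flight.
    free_after, _total = torch.cuda.mem_get_info()
    return free_before - free_after
```

Presumably this is why the sync is safe to remove here: it adds a device-wide stall without changing what `torch.cuda.mem_get_info()` reports, since that reading tracks driver-level reservations rather than kernel completion.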