diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 6981f2ce5623..a8645b75c8a1 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -121,14 +121,22 @@ def load_bytes(
         original_fps = cap.get(cv2.CAP_PROP_FPS)
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
+        validate_frames_list = []
+        for idx in range(total_frames_num):
+            ok = cap.grab()
+            if ok:
+                validate_frames_list.append(idx)
+
+        validate_total_frames_num = len(validate_frames_list)
+
         # resample video to target num_frames
         full_read = num_frames == -1 or total_frames_num < num_frames
         if full_read:
-            num_frames = total_frames_num
+            num_frames = validate_total_frames_num
             frame_idx = list(range(0, num_frames))
         else:
             uniform_sampled_frames = np.linspace(0,
-                                                 total_frames_num - 1,
+                                                 validate_total_frames_num - 1,
                                                  num_frames,
                                                  dtype=int)
             frame_idx = uniform_sampled_frames.tolist()
@@ -138,15 +146,21 @@ def load_bytes(
         frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
 
         i = 0
-        for idx in range(total_frames_num):
-            ok = cap.grab()
-            if not ok:
-                break
-            if idx in frame_idx:
-                ret, frame = cap.retrieve()
-                if ret:
-                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    i += 1
+        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
+        validate_list_idx = frame_idx[i]
+        target_frame_pos = validate_frames_list[validate_list_idx]
+        for pos in range(total_frames_num):
+            cap.grab()
+            if target_frame_pos != pos:
+                continue
+            ret, frame = cap.retrieve()
+            if ret:
+                frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                i += 1
+                if i >= len(frame_idx):
+                    break
+                validate_list_idx = frame_idx[i]
+                target_frame_pos = validate_frames_list[validate_list_idx]
 
         assert i == num_frames, (f"Expected reading {num_frames} frames, "
                                  f"but only loaded {i} frames from video.")