Skip to content

Commit 0932932

Browse files
committed
Fix instructblip qformer size mismatch and multi-images problem
1 parent 557a6a3 commit 0932932

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

lmms_eval/models/instructblip.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from lmms_eval.api.instance import Instance
77
from lmms_eval.api.model import lmms
88
from lmms_eval.api.registry import register_model
9+
from lmms_eval.tasks.mmmu.utils_group_img import process_images
910
from accelerate import Accelerator, DistributedType
1011
from accelerate.state import AcceleratorState
1112
from typing import List, Optional, Union, Tuple
@@ -187,7 +188,13 @@ def _collate(x):
187188
if "<image>" in context:
188189
# instruct blip does not expect the <image> tag
189190
context = context.replace("<image>", "")
190-
inputs = self._image_processor(images=visuals, text=context, return_tensors="pt").to(self.device)
191+
# Set truncation to True here; the max length for the qformer tokenizer is 512
192+
# if not truncated, some questions will cause a size mismatch
193+
# The transformers implementation can't handle multiple images for blip
194+
# Concat it into one image
195+
if len(visuals) > 1:
196+
visuals = [process_images(visuals)]
197+
inputs = self._image_processor(images=visuals, text=context, return_tensors="pt", truncation=True).to(self.device)
191198

192199
gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
193200
if "max_new_tokens" not in gen_kwargs:

0 commit comments

Comments
 (0)