6 | 6 | from lmms_eval.api.instance import Instance |
7 | 7 | from lmms_eval.api.model import lmms |
8 | 8 | from lmms_eval.api.registry import register_model |
| 9 | +from lmms_eval.tasks.mmmu.utils_group_img import process_images |
9 | 10 | from accelerate import Accelerator, DistributedType |
10 | 11 | from accelerate.state import AcceleratorState |
11 | 12 | from typing import List, Optional, Union, Tuple |
@@ -187,7 +188,13 @@ def _collate(x): |
187 | 188 | if "<image>" in context: |
188 | 189 | # instruct blip does not expect the <image> tag |
189 | 190 | context = context.replace("<image>", "") |
190 | | - inputs = self._image_processor(images=visuals, text=context, return_tensors="pt").to(self.device) |
| 191 | + # Set truncation to True here; the max length for the Q-Former tokenizer is 512.
| 192 | + # Without truncation, some long questions cause a size mismatch.
| 193 | + # The transformers implementation can't handle multiple images for BLIP,
| 194 | + # so concatenate them into one image.
| 195 | + if len(visuals) > 1: |
| 196 | + visuals = [process_images(visuals)] |
| 197 | + inputs = self._image_processor(images=visuals, text=context, return_tensors="pt", truncation=True).to(self.device) |
191 | 198 |
|
192 | 199 | gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] |
193 | 200 | if "max_new_tokens" not in gen_kwargs: |
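
For context, here is a minimal sketch of the workaround outside the harness. It assumes `process_images` simply tiles the PIL images onto a single canvas; the `concat_horizontally` helper below is a hypothetical stand-in for illustration, not the actual `lmms_eval.tasks.mmmu.utils_group_img.process_images`, and the checkpoint name is only an example. The `truncation=True` kwarg is forwarded by `InstructBlipProcessor` to its tokenizers, matching the change above.

```python
# Minimal sketch (assumption: process_images tiles images onto one canvas;
# concat_horizontally is a hypothetical stand-in, not the real helper).
from PIL import Image
from transformers import InstructBlipProcessor


def concat_horizontally(images):
    """Paste a list of PIL images side by side onto a single RGB canvas."""
    width = sum(img.width for img in images)
    height = max(img.height for img in images)
    canvas = Image.new("RGB", (width, height), (255, 255, 255))
    x = 0
    for img in images:
        canvas.paste(img, (x, 0))
        x += img.width
    return canvas


# Example checkpoint; any InstructBLIP processor works the same way.
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

visuals = [Image.open("page1.png"), Image.open("page2.png")]  # placeholder inputs
if len(visuals) > 1:
    visuals = [concat_horizontally(visuals)]

# truncation=True caps the Q-Former text input at its 512-token maximum,
# avoiding the size mismatch noted in the commit.
inputs = processor(
    images=visuals,
    text="Describe the figure.",
    return_tensors="pt",
    truncation=True,
)
```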
|