
Commit c76d74a

add II-Bench
1 parent 43fe1e1 commit c76d74a

File tree

2 files changed: +90 -0 lines changed

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
dataset_path: m-a-p/II-Bench
task: "ii-bench"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.ii_bench_doc_to_visual
doc_to_text: !function utils.ii_bench_doc_to_text
doc_to_target: "answers"
generation_kwargs:
  max_new_tokens: 32
  temperature: 0
  do_sample: False
process_results: !function utils.ii_bench_process_results
metric_list:
  - metric: submission
    aggregation: !function utils.ii_bench_aggregate_submissions
model_specific_prompt_kwargs:
  default:
    pre_prompt: "Instruction: Please try to answer the single-answer multiple choice question below based on the picture provided.\n"
    post_prompt: "\nAnswer:"
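
For reference, these prompt kwargs wrap the six-option question block produced by utils.ii_bench_doc_to_text (defined below). A minimal sketch of the rendered prompt, using a made-up document whose field values are placeholders rather than real II-Bench data:

# Sketch only: shows how pre_prompt / post_prompt from the YAML frame the question block.
doc = {
    "question": "What does the image imply?",
    "option1": "Hope", "option2": "Loss", "option3": "Anger",
    "option4": "Joy", "option5": "Fear", "option6": "Calm",
}
kwargs = {
    "pre_prompt": "Instruction: Please try to answer the single-answer multiple choice question below based on the picture provided.\n",
    "post_prompt": "\nAnswer:",
}
# ii_bench_doc_to_text(doc, kwargs) would return a single string of the form:
#   Instruction: ... picture provided.
#   Question: What does the image imply?
#   (A) Hope
#   ...
#   (F) Calm
#   Answer: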

lmms_eval/tasks/ii_bench/utils.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import json
import logging
import re
from collections import Counter

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# Six-option multiple-choice prompt template, filled from the doc's question/option fields.
PROMPT = """Question: {}
(A) {}
(B) {}
(C) {}
(D) {}
(E) {}
(F) {}"""


def ii_bench_doc_to_text(doc, model_specific_prompt_kwargs):
    question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"])
    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    post_prompt = model_specific_prompt_kwargs["post_prompt"]
    return f"{pre_prompt}{question}{post_prompt}"


def ii_bench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def extract_option_labels(text, options=None):
    if isinstance(text, dict):
        return "error"
    # First look for parenthesized labels such as "(B)".
    pattern = r"\(([A-F])\)"
    matches = re.findall(pattern, text)

    # Fall back to bare letters such as "B".
    if not matches:
        pattern = r"\b([A-F])\b"
        matches = re.findall(pattern, text)

    if matches:
        # Pick the most frequent label; on ties, take the last of the tied candidates.
        counter = Counter(matches)
        most_common = counter.most_common()
        max_count = most_common[0][1]
        candidates = [item for item in most_common if item[1] == max_count]
        return candidates[-1][0]
    else:
        # Last resort: match the response against the option strings themselves.
        if options:
            counter = Counter()
            for i, option in enumerate(options, start=1):
                label = chr(64 + i)  # 1 -> "A", 2 -> "B", ...
                option_stripped = option.strip()
                if option_stripped in text:
                    counter[label] += 1
                elif text in option:
                    counter[label] += 1
            if counter:
                most_common = counter.most_common()
                max_count = most_common[0][1]
                candidates = [item for item in most_common if item[1] == max_count]
                return candidates[-1][0]
    return None


def ii_bench_process_results(doc, results):
    response = results[0]
    predict = extract_option_labels(response, [doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]])
    return {"submission": {"id": doc["id"], "predict_answer": predict, "response": response}}


def ii_bench_aggregate_submissions(results, args):
    file = generate_submission_file("ii_bench_test_for_submission.json", args)
    with open(file, "w") as f:
        json.dump(results, f, indent=4)
    logging.getLogger("lmms-eval").info(f"Results saved to {file}")
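
As a quick illustration of the extraction fallbacks in extract_option_labels above (the sample responses are invented, not drawn from the benchmark):

# Parenthesized labels are matched first, then bare letters, then the option text itself.
extract_option_labels("The answer is (B).")        # -> "B"
extract_option_labels("I would go with C here.")   # -> "C"
extract_option_labels("It conveys hope.", ["hope", "loss", "anger", "joy", "fear", "calm"])  # -> "A"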

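Each entry in the aggregated ii_bench_test_for_submission.json follows the dict built in ii_bench_process_results; an illustrative record (values are placeholders) looks like:

# One record from the submission list (illustrative values only).
record = {"id": 0, "predict_answer": "B", "response": "The answer is (B)."}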