
Commit 39d40de

Merge pull request #129 from Dannoopsy/mmbench_ru
add task MMBench-ru
2 parents e19b43a + ba7081c

File tree: 4 files changed, +163 -0 lines
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+dataset_path: deepvk/MMBench-ru
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nВыбери правильный вариант ответа буквой."
+doc_to_visual: !function ru_utils.mmbench_doc_to_visual
+doc_to_text: !function ru_utils.mmbench_doc_to_text
+doc_to_target: "answer"
+process_results: !function ru_utils.mmbench_process_results
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
+output_type: generate_until
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
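
Note: the post_prompt above is Russian for roughly "Choose the correct answer option by its letter." This template (pulled in by the dev task config below via _default_template_mmbench_ru_yaml) hands prompt assembly to ru_utils.mmbench_doc_to_text, the last file in this commit, which concatenates hint, question, and lettered options and then appends the post_prompt. A minimal sketch of that assembly on a toy document follows; format_options is a hypothetical stand-in for MMBench_Evaluator.create_options_prompt, whose exact output format is not shown in this diff.

import pandas as pd

def format_options(doc, candidates=("A", "B", "C", "D", "E")):
    # Assumed layout: one "X. <option text>" line per option present in the doc.
    return "\n".join(f"{c}. {doc[c]}" for c in candidates if doc.get(c) not in (None, "", "nan"))

doc = {
    "hint": "nan",  # MMBench marks missing hints with the string "nan"
    "question": "Что изображено на картинке?",
    "A": "Кошка", "B": "Собака", "C": "Птица", "D": "Рыба",
}
post_prompt = "\nВыбери правильный вариант ответа буквой."

options = format_options(doc)
# Same hint handling as mmbench_doc_to_text: drop the hint when it is NaN or the string "nan".
if pd.notna(doc["hint"]) and doc["hint"] != "nan":
    query = f"{doc['hint']} {doc['question']} {options}"
else:
    query = f"{doc['question']} {options}"
print(query + post_prompt)

Under these assumptions the printed string approximates the text the model is prompted with before generation stops at "ASSISTANT:".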

lmms_eval/tasks/mmbench/mmbench.yaml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ task:
   - mmbench_cn_dev
   - mmbench_cn_test
   - mmbench_cn_cc
+  - mmbench_ru_dev
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
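
The ru_utils module added below re-reads this mmbench.yaml at import time to pull metadata such as sys_prompt and gpt_eval_model_name, and it drops the !function lines first because yaml.safe_load has no constructor for that custom tag. A small self-contained sketch of the same pattern, using an inline YAML string instead of the file:

import yaml

raw_yaml = """\
task:
  - mmbench_cn_cc
  - mmbench_ru_dev
doc_to_visual: !function ru_utils.mmbench_doc_to_visual
metadata:
  version: 0.0
  sys_prompt: "There are several options:"
"""

# Strip lines carrying the !function tag before parsing, as ru_utils.py does.
safe_lines = [line for line in raw_yaml.splitlines(keepends=True) if "!function" not in line]
config = yaml.safe_load("".join(safe_lines))
print(config["metadata"]["sys_prompt"])  # There are several options:
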
Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+task: "mmbench_ru_dev"
+test_split: dev
+include: _default_template_mmbench_ru_yaml
+metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function ru_utils.mmbench_aggregate_dev_results_eval
+    higher_is_better: true
+  - metric: submission
+    aggregation: !function ru_utils.mmbench_aggregate_dev_results_submission
+    higher_is_better: true
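
Both metrics are produced per document by ru_utils.mmbench_process_results, which returns a dict keyed by metric name (plus the A-E option columns); the harness collects those dicts into lists and passes them to the aggregation functions named above. A rough sketch of the submission path reduced to its pandas step; generate_submission_file, which needs the harness's args object to resolve the output directory, is left out, and all field values are illustrative.

import pandas as pd

# Illustrative per-document record, shaped like the "submission" entry
# returned by mmbench_process_results (values here are made up).
record = {
    "index": 0,
    "question": "Что изображено на картинке?",
    "answer": "A",
    "prediction": "A",
    "hint": "nan",
    "source": "dev",              # placeholder
    "split": "dev",
    "category": "perception",     # placeholder
    "L2-category": "image topic", # placeholder
    "A": "Кошка", "B": "Собака", "C": "Птица", "D": "Рыба", "E": "nan",
}

results = [record]          # one dict per evaluated document
df = pd.DataFrame(results)  # mmbench_aggregate_dev_results_submission writes this to .xlsx
print(df.shape)             # (1, 14)
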
Lines changed: 128 additions & 0 deletions

@@ -0,0 +1,128 @@
+import yaml
+import os
+from pathlib import Path
+import pandas as pd
+import json
+
+from loguru import logger as eval_logger
+from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+else:
+    API_URL = "YOUR_API_URL"
+    API_KEY = "YOUR_API_KEY"
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
+
+
+def mmbench_doc_to_visual(doc):
+    return [doc["image"].convert("RGB")]
+
+
+def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    option_candidate = ["A", "B", "C", "D", "E"]
+    options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
+
+    data = {
+        # "img": doc["image"],
+        "question": doc["question"],
+        "answer": doc.get("answer", None),
+        "options": options_prompt,
+        "category": doc["category"],
+        "L2-category": doc["l2-category"],
+        "options_dict": options_dict,
+        "index": doc["index"],
+        "hint": doc["hint"],
+        "source": doc["source"],
+        "split": doc["split"],
+    }
+
+    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"
+
+    if model_specific_prompt_kwargs:
+        query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
+
+    return query_prompt
+
+
+def mmbench_process_results(doc, results):
+    model_response = results[0].strip()
+    data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["l2-category"],
+        },
+        "submission": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["l2-category"],
+        },
+    }
+    option_candidate = ["A", "B", "C", "D", "E"]
+    for c in option_candidate:
+        data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
+    return data
+
+
+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-RU(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_ru_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
+    }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
+
+
+def mmbench_aggregate_dev_results_submission(results, args):
+    df = pd.DataFrame(results)
+    excel_write_path = generate_submission_file("mmbench_ru_dev_results.xlsx", args)
+    with pd.ExcelWriter(excel_write_path) as writer:
+        df.to_excel(writer, index=False)
+    eval_logger.info(f"Saved results to {excel_write_path}")
+
+
+def mmbench_aggregate_test_results(results, args):
+    df = pd.DataFrame(results)
+    excel_write_path = generate_submission_file("mmbench_ru_test_results.xlsx", args)
+    with pd.ExcelWriter(excel_write_path) as writer:
+        df.to_excel(writer, index=False)
+    eval_logger.info(f"Saved results to {excel_write_path}")
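
The submission aggregators write their DataFrame with pandas.ExcelWriter, which needs an Excel engine such as openpyxl installed for .xlsx output. A quick follow-up sketch for inspecting the produced file; the path is an assumption, since generate_submission_file derives the real location from the harness's output arguments.

import pandas as pd

# Hypothetical path: generate_submission_file decides the actual directory.
submission_path = "./logs/submissions/mmbench_ru_dev_results.xlsx"

df = pd.read_excel(submission_path)  # requires openpyxl for .xlsx files
print(df[["index", "question", "prediction", "answer"]].head())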
