
Commit d78ec86

Merge branch 'main' into dev/interleave
2 parents ebe7217 + fce85f1

File tree

10 files changed: +458, -114 lines


docs/README.md

Lines changed: 2 additions & 1 deletion
@@ -8,4 +8,5 @@ Majority of this documentation is adapted from [lm-eval-harness](https://github.
 
 * To learn about the command line flags, see the [commands](commands.md)
 * To learn how to add a new model, see the [Model Guide](model_guide.md).
-* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
+* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
+* If you need to upload your datasets into the correct HF format with viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools)
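Note: the new README bullet points to a tools guide for getting datasets into a Hub-friendly layout. As a rough, hypothetical sketch of that workflow (the repo id, fields, and example record below are placeholders, not from this commit), one way to push an image dataset so the Hub viewer can render it:

```python
# Hypothetical sketch: build a small image dataset with explicit features and push it
# to the Hub so the dataset viewer works. Repo id and fields are placeholders.
from datasets import Dataset, Features, Image, Value
from PIL import Image as PILImage

features = Features({"question": Value("string"), "answer": Value("string"), "image": Image()})
ds = Dataset.from_list(
    [{"question": "What is shown?", "answer": "A white square.", "image": PILImage.new("RGB", (64, 64), "white")}],
    features=features,
)
ds.push_to_hub("your-org/your-dataset")  # placeholder repo id; requires `huggingface-cli login`
```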

lmms_eval/tasks/llava_wilder/llava_wilder_full.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

lmms_eval/tasks/llava_wilder/llava_wilder_medium.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

lmms_eval/tasks/llava_wilder/llava_wilder_small.yaml

Lines changed: 2 additions & 3 deletions
@@ -1,9 +1,8 @@
-dataset_path: lmms-lab/llava-wilder
-dataset_name: Small
+dataset_path: lmms-lab/llava-bench-wilder
 dataset_kwargs:
   token: True
 task: "llava_wilder_small"
-test_split: train
+test_split: small
 model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
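A minimal sketch of what the updated config now points at, assuming the standard `datasets` loading path; the repo id and split name come from the YAML above, and `token=True` assumes a recent `datasets` release where `token=` replaced `use_auth_token=`:

```python
# Sketch only: load the renamed dataset and split referenced by the YAML above.
from datasets import load_dataset

ds = load_dataset("lmms-lab/llava-bench-wilder", split="small", token=True)  # gated repo, token required
print(ds[0].keys())
```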

lmms_eval/tasks/llava_wilder/utils.py

Lines changed: 4 additions & 53 deletions
@@ -13,17 +13,6 @@
 # Set up a logger
 from loguru import logger as eval_logger
 
-# Create a static variable to track if the message has been logged
-if not hasattr(eval_logger, "dashcope_warning_logged"):
-    eval_logger.dashcope_warning_logged = False
-
-try:
-    import dashscope
-except ImportError:
-    if not eval_logger.dashcope_warning_logged:
-        eval_logger.debug("Dashcope not found, make sure you install dashscope to use qwen vl")
-        eval_logger.dashcope_warning_logged = True
-
 NUM_SECONDS_TO_SLEEP = 5
 dir_path = os.path.dirname(os.path.realpath(__file__))
 
@@ -58,14 +47,6 @@
         "Content-Type": "application/json",
     }
 
-elif API_TYPE == "qwen_vl":
-    API_URL = os.getenv("QWEN_ENDPOINT", "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation")
-    API_KEY = os.getenv("DASHSCOPE_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-
 
 def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     headers = {
@@ -114,29 +95,6 @@ def image_to_base64(pil_image):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
-def qwen_multimodal_conversation_call(text_content, image_content, retries=5):
-    """Simple single round multimodal conversation call."""
-    messages = [{"role": "user", "content": [{"image": image_content}, {"text": text_content}]}]
-    for attempt in range(retries):
-        try:
-            response_data = dashscope.MultiModalConversation.call(model=GPT_EVAL_MODEL_NAME, messages=messages)
-            # The response status_code is HTTPStatus.OK indicate success,
-            # otherwise indicate request is failed, you can get error code
-            # and message from code and message.
-            content = response_data["output"]["choices"][0]["message"]["content"][0]["text"].strip()
-            if content != "":
-                return content, GPT_EVAL_MODEL_NAME
-            break  # If successful, break out of the loop
-        except Exception as e:
-            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
-            if attempt < retries:  # If we have retries left, sleep and then continue to next attempt
-                time.sleep(NUM_SECONDS_TO_SLEEP)
-            else:  # If this was the last attempt, log and return empty
-                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
-                return "", ""
-    return "", ""
-
-
 def parse_score(review):
     try:
         score_pair = review.split("\n")[0]
@@ -162,20 +120,13 @@ def llava_process_results(doc, result):
     """
     try:
         question = doc.get("question", "")
-        ans1 = doc.get("gpt4v_answer", "")
+        ans1 = doc.get("answer", "")
         ans2 = result[0] if result else ""
         content = f"[Question]\n{question}\n\n" + f"[Assistant 1]\n{ans1}\n\n[End of Assistant 1]\n\n" + f"[Assistant 2]\n{ans2}\n\n[End of Assistant 2]\n\n" f"[System]\n{judge_rules}\n\n"
         visuals = llava_doc_to_visual(doc)
-        if API_TYPE == "qwen_vl":
-            file_path = os.path.join(dir_path, f"tmp_{doc['question_id']}.jpg")
-            visuals[0].save(file_path)
-            image_content = "file://" + file_path
-            review, model_name = qwen_multimodal_conversation_call(content, image_content=image_content)
-            os.remove(file_path)
-        elif API_TYPE == "openai":
-            image_path = doc["image"]
-            base64_image = image_to_base64(image_path)
-            review, model_name = get_chat_response(base64_image, content)
+        image_path = doc["image"]
+        base64_image = image_to_base64(image_path)
+        review, model_name = get_chat_response(base64_image, content)
         scores = parse_score(review)
     except Exception as e:
         eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
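With the qwen_vl branch removed, every review now goes through `image_to_base64` plus `get_chat_response`. A self-contained sketch of the encoding step (the return expression mirrors the line shown in the diff above; the JPEG format and the toy image are assumptions, not the repo's exact values):

```python
# Sketch of the surviving judging path: serialize a PIL image to base64 before
# passing it to the GPT judge request built by get_chat_response in utils.py.
import base64
from io import BytesIO

from PIL import Image


def image_to_base64(pil_image: Image.Image) -> str:
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")  # format is an assumption
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


if __name__ == "__main__":
    img = Image.new("RGB", (64, 64), color="white")  # stand-in for doc["image"]
    b64 = image_to_base64(img)
    # review, model_name = get_chat_response(b64, content)  # as in llava_process_results
    print(len(b64))
```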

lmms_eval/tasks/videomme/utils.py

File mode changed from 100755 to 100644.
Lines changed: 114 additions & 9 deletions
@@ -10,6 +10,8 @@
 import sys
 from typing import List, Dict, Optional, Union
 import re
+import cv2
+import numpy as np
 
 from loguru import logger as eval_logger
 
@@ -80,17 +82,55 @@
 # cache_dir = os.path.join(hf_home, cache_dir)
 # base_cache_dir = config["dataset_kwargs"]["cache_dir"]
 base_cache_dir = os.path.expanduser(hf_home)
-
+with open(Path(__file__).parent / "videomme.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def parse_subtitle_time(time_str):
+    h, m, s_ms = time_str.split(':')
+    s, ms = s_ms.split(',')
+    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
+
+def load_subtitles(subtitle_path):
+    subtitles = {}
+    with open(subtitle_path, 'r', encoding='utf-8') as file:
+        content = file.read().split('\n\n')
+        for section in content:
+            if section.strip():
+                lines = section.split('\n')
+                if len(lines) >= 3:
+                    time_range = lines[1].split(' --> ')
+                    start_time = parse_subtitle_time(time_range[0])
+                    end_time = parse_subtitle_time(time_range[1])
+                    text = ' '.join(line for line in lines[2:])
+                    subtitles[(start_time, end_time)] = text
+    return subtitles
+
+def convert_time_to_frame(time_in_seconds, fps):
+    return int(time_in_seconds * fps)
+
+def extract_subtitles(video_path, subtitle_path):
+    video = cv2.VideoCapture(video_path)
+    fps = video.get(cv2.CAP_PROP_FPS)
+    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    subtitles = load_subtitles(subtitle_path)
+
+    subtitle_frames = []
+    for (start_time, end_time), text in subtitles.items():
+        start_frame = convert_time_to_frame(start_time, fps)
+        end_frame = convert_time_to_frame(end_time, fps)
+        subtitle_frames.append((start_frame, end_frame, text))
+
+    return subtitle_frames, total_frame
 
 def videomme_doc_to_visual(doc):
-    with open(Path(__file__).parent / "videomme.yaml", "r") as f:
-        raw_data = f.readlines()
-        safe_data = []
-        for i, line in enumerate(raw_data):
-            # remove function definition since yaml load cannot handle it
-            if "!function" not in line:
-                safe_data.append(line)
-    cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
     cache_dir = os.path.join(base_cache_dir, cache_name)
     video_path = doc["videoID"] + ".mp4"
     video_path = os.path.join(cache_dir, video_path)
@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):
 
 
 def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
+    question = doc["question"]
+    option = str(doc["options"])
+    question = question + "\n" + option
+    full_prompt = option_prompt + "\n" + question + "\n" + "The best answer is:"
+    return full_prompt
+    # Frames + Subs
+    # This video's subtitles are listed below:
+    # 【subtitles】
+
+    # Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
+    # 【question】
+    # The best answer is:
+    # Frames / Frames + Audio
+    # Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
+    # 【question】
+    # The best answer is:
+
+def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
+    cache_dir = os.path.join(base_cache_dir, cache_name)
+    video_path = doc["videoID"] + ".mp4"
+    subtitle_path = os.path.join(cache_dir, "subtitle", doc["videoID"] + ".srt")
+    video_path = os.path.join(cache_dir, video_path)
+    if os.path.exists(subtitle_path):  # denote have subtitle
+        subtitle = open(subtitle_path).readlines()
+    else:
+        subtitle = ""
+    subtitles_prompt = "This video's subtitles are listed below: \n"
+    if subtitle == "":
+        subtitle = "No subtitles available"
+    else:
+        if "gemini_api_flag" in model_specific_prompt_kwargs:  # specific for gemini_api
+            if model_specific_prompt_kwargs['gemini_api_flag'] == "full subtitle":
+                textlist = []
+                for ele in subtitle:
+                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
+                    matches = re.findall(pattern, ele)
+                    if matches:
+                        textlist.append(matches[0])
+                subtitle_text = "\n".join(textlist)
+        else:
+            if "frame_num" in model_specific_prompt_kwargs:
+                frame_num = model_specific_prompt_kwargs['frame_num']
+                subtitle_by_frame, total_frame = extract_subtitles(video_path, subtitle_path)
+                uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()
+
+                subtitle_by_frame_idx = []
+                for frame_idx in uniform_sampled_frames:
+                    for idx, title in enumerate(subtitle_by_frame):
+                        if frame_idx < title[1] and frame_idx >= title[0]:
+                            subtitle_by_frame_idx.append(idx)
+                subtitle_by_frame_idx = list(set(subtitle_by_frame_idx))
+
+                textlist = []
+                for idx in subtitle_by_frame_idx:
+                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
+                    raw_text = re.findall(pattern, subtitle_by_frame[idx][2])
+                    try:
+                        textlist.append(raw_text[0])
+                    except:
+                        continue
+                subtitle_text = "\n".join(textlist)
+        subtitle = subtitle_text
+
+    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
     question = doc["question"]
     option = str(doc["options"])
     question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
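The core of the new `videomme_doc_to_text_subtitle` logic is aligning subtitles with uniformly sampled frames: sample `frame_num` frame indices, keep only subtitles whose frame interval covers a sampled index, and join their text. A distilled, standalone sketch with toy values (not the repo's data):

```python
# Distilled illustration of the subtitle selection added above.
import numpy as np

subtitle_by_frame = [(0, 30, "hello"), (45, 90, "world"), (120, 150, "unused")]  # (start_frame, end_frame, text)
total_frame, frame_num = 100, 4

sampled = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()  # [0, 33, 66, 99]
kept = {
    idx
    for frame_idx in sampled
    for idx, (start, end, _) in enumerate(subtitle_by_frame)
    if start <= frame_idx < end  # same condition as frame_idx < title[1] and frame_idx >= title[0]
}
subtitle_text = "\n".join(subtitle_by_frame[idx][2] for idx in sorted(kept))
print(subtitle_text)  # "hello\nworld"; the third subtitle is never covered by a sampled frame
```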

lmms_eval/tasks/videomme/videomme.yaml

File mode changed from 100755 to 100644.
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+dataset_path: lmms-lab/Video-MME
+dataset_kwargs:
+  token: True
+  cache_dir: videomme
+  video: True
+  # From_YouTube: True
+task: videomme_w_subtitle
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videomme_doc_to_visual
+doc_to_text: !function utils.videomme_doc_to_text_subtitle
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.videomme_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: videomme_percetion_score
+    aggregation: !function utils.videomme_aggregate_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    frame_num: 32
+  gemini_api:
+    gemini_api_flag: "full subtitle"
+  # gpt4v:
+  #   pre_prompt: ""
+  #   post_prompt:
+  # # qwen_vl:
+  # #   pre_prompt: ""
+  # #   post_prompt: " Answer:"
+  # # otterhd:
+  # #   pre_prompt: ""
+  # #   post_prompt: " Answer:"
+  # xcomposer2_4khd:
+  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
+  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
+metadata:
+  - version: 0.0
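Because this YAML uses `!function` tags, `yaml.safe_load` cannot parse it directly; the task utils strip those lines before loading, as in the module-level code added to utils.py above. A small sketch of that pattern (it assumes the script sits next to videomme.yaml):

```python
# Drop lines containing custom `!function` tags before safe-loading, mirroring utils.py.
from pathlib import Path

import yaml

raw_lines = (Path(__file__).parent / "videomme.yaml").read_text().splitlines(keepends=True)
safe_lines = [line for line in raw_lines if "!function" not in line]
config = yaml.safe_load("".join(safe_lines))
print(config["dataset_kwargs"]["cache_dir"])  # -> videomme, per the YAML above
```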
