
Commit d78ec86

Merge branch 'main' into dev/interleave
2 parents ebe7217 + fce85f1

File tree

10 files changed: +458, -114 lines


docs/README.md

Lines changed: 2 additions & 1 deletion
@@ -8,4 +8,5 @@ Majority of this documentation is adapted from [lm-eval-harness](https://github.
 
 * To learn about the command line flags, see the [commands](commands.md)
 * To learn how to add a new model, see the [Model Guide](model_guide.md).
-* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
+* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
+* If you need to upload your datasets into the correct HF format with viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools)
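Note: the new README bullet points to a tools guide for getting datasets into a Hub-friendly layout. As a rough, hypothetical sketch of that workflow (the repo id, fields, and example record below are placeholders, not from this commit), one way to push an image dataset so the Hub viewer can render it:

```python
# Hypothetical sketch: build a small image dataset with explicit features and push it
# to the Hub so the dataset viewer works. Repo id and fields are placeholders.
from datasets import Dataset, Features, Image, Value
from PIL import Image as PILImage

features = Features({"question": Value("string"), "answer": Value("string"), "image": Image()})
ds = Dataset.from_list(
    [{"question": "What is shown?", "answer": "A white square.", "image": PILImage.new("RGB", (64, 64), "white")}],
    features=features,
)
ds.push_to_hub("your-org/your-dataset")  # placeholder repo id; requires `huggingface-cli login`
```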

lmms_eval/tasks/llava_wilder/llava_wilder_full.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

lmms_eval/tasks/llava_wilder/llava_wilder_medium.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

lmms_eval/tasks/llava_wilder/llava_wilder_small.yaml

Lines changed: 2 additions & 3 deletions
@@ -1,9 +1,8 @@
-dataset_path: lmms-lab/llava-wilder
-dataset_name: Small
+dataset_path: lmms-lab/llava-bench-wilder
 dataset_kwargs:
   token: True
 task: "llava_wilder_small"
-test_split: train
+test_split: small
 model_specific_prompt_kwargs:
   default:
     pre_prompt: ""
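A minimal sketch of what the updated config now points at, assuming the standard `datasets` loading path; the repo id and split name come from the YAML above, and `token=True` assumes a recent `datasets` release where `token=` replaced `use_auth_token=`:

```python
# Sketch only: load the renamed dataset and split referenced by the YAML above.
from datasets import load_dataset

ds = load_dataset("lmms-lab/llava-bench-wilder", split="small", token=True)  # gated repo, token required
print(ds[0].keys())
```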

lmms_eval/tasks/llava_wilder/utils.py

Lines changed: 4 additions & 53 deletions
@@ -13,17 +13,6 @@
 # Set up a logger
 from loguru import logger as eval_logger
 
-# Create a static variable to track if the message has been logged
-if not hasattr(eval_logger, "dashcope_warning_logged"):
-    eval_logger.dashcope_warning_logged = False
-
-try:
-    import dashscope
-except ImportError:
-    if not eval_logger.dashcope_warning_logged:
-        eval_logger.debug("Dashcope not found, make sure you install dashscope to use qwen vl")
-        eval_logger.dashcope_warning_logged = True
-
 NUM_SECONDS_TO_SLEEP = 5
 dir_path = os.path.dirname(os.path.realpath(__file__))
 
@@ -58,14 +47,6 @@
         "Content-Type": "application/json",
     }
 
-elif API_TYPE == "qwen_vl":
-    API_URL = os.getenv("QWEN_ENDPOINT", "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation")
-    API_KEY = os.getenv("DASHSCOPE_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-
 
 def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     headers = {
@@ -114,29 +95,6 @@ def image_to_base64(pil_image):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
 
-def qwen_multimodal_conversation_call(text_content, image_content, retries=5):
-    """Simple single round multimodal conversation call."""
-    messages = [{"role": "user", "content": [{"image": image_content}, {"text": text_content}]}]
-    for attempt in range(retries):
-        try:
-            response_data = dashscope.MultiModalConversation.call(model=GPT_EVAL_MODEL_NAME, messages=messages)
-            # The response status_code is HTTPStatus.OK indicate success,
-            # otherwise indicate request is failed, you can get error code
-            # and message from code and message.
-            content = response_data["output"]["choices"][0]["message"]["content"][0]["text"].strip()
-            if content != "":
-                return content, GPT_EVAL_MODEL_NAME
-            break  # If successful, break out of the loop
-        except Exception as e:
-            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
-            if attempt < retries:  # If we have retries left, sleep and then continue to next attempt
-                time.sleep(NUM_SECONDS_TO_SLEEP)
-            else:  # If this was the last attempt, log and return empty
-                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
-                return "", ""
-    return "", ""
-
-
 def parse_score(review):
     try:
         score_pair = review.split("\n")[0]
@@ -162,20 +120,13 @@ def llava_process_results(doc, result):
     """
     try:
         question = doc.get("question", "")
-        ans1 = doc.get("gpt4v_answer", "")
+        ans1 = doc.get("answer", "")
         ans2 = result[0] if result else ""
         content = f"[Question]\n{question}\n\n" + f"[Assistant 1]\n{ans1}\n\n[End of Assistant 1]\n\n" + f"[Assistant 2]\n{ans2}\n\n[End of Assistant 2]\n\n" f"[System]\n{judge_rules}\n\n"
         visuals = llava_doc_to_visual(doc)
-        if API_TYPE == "qwen_vl":
-            file_path = os.path.join(dir_path, f"tmp_{doc['question_id']}.jpg")
-            visuals[0].save(file_path)
-            image_content = "file://" + file_path
-            review, model_name = qwen_multimodal_conversation_call(content, image_content=image_content)
-            os.remove(file_path)
-        elif API_TYPE == "openai":
-            image_path = doc["image"]
-            base64_image = image_to_base64(image_path)
-            review, model_name = get_chat_response(base64_image, content)
+        image_path = doc["image"]
+        base64_image = image_to_base64(image_path)
+        review, model_name = get_chat_response(base64_image, content)
         scores = parse_score(review)
     except Exception as e:
         eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
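With the qwen_vl branch removed, every review now goes through `image_to_base64` plus `get_chat_response`. A self-contained sketch of the encoding step (the return expression mirrors the line shown in the diff above; the JPEG format and the toy image are assumptions, not the repo's exact values):

```python
# Sketch of the surviving judging path: serialize a PIL image to base64 before
# passing it to the GPT judge request built by get_chat_response in utils.py.
import base64
from io import BytesIO

from PIL import Image


def image_to_base64(pil_image: Image.Image) -> str:
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")  # format is an assumption
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


if __name__ == "__main__":
    img = Image.new("RGB", (64, 64), color="white")  # stand-in for doc["image"]
    b64 = image_to_base64(img)
    # review, model_name = get_chat_response(b64, content)  # as in llava_process_results
    print(len(b64))
```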

lmms_eval/tasks/videomme/utils.py

File mode changed from 100755 to 100644.
Lines changed: 114 additions & 9 deletions
@@ -10,6 +10,8 @@
 import sys
 from typing import List, Dict, Optional, Union
 import re
+import cv2
+import numpy as np
 
 from loguru import logger as eval_logger
 
@@ -80,17 +82,55 @@
 # cache_dir = os.path.join(hf_home, cache_dir)
 # base_cache_dir = config["dataset_kwargs"]["cache_dir"]
 base_cache_dir = os.path.expanduser(hf_home)
-
+with open(Path(__file__).parent / "videomme.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def parse_subtitle_time(time_str):
+    h, m, s_ms = time_str.split(':')
+    s, ms = s_ms.split(',')
+    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
+
+def load_subtitles(subtitle_path):
+    subtitles = {}
+    with open(subtitle_path, 'r', encoding='utf-8') as file:
+        content = file.read().split('\n\n')
+        for section in content:
+            if section.strip():
+                lines = section.split('\n')
+                if len(lines) >= 3:
+                    time_range = lines[1].split(' --> ')
+                    start_time = parse_subtitle_time(time_range[0])
+                    end_time = parse_subtitle_time(time_range[1])
+                    text = ' '.join(line for line in lines[2:])
+                    subtitles[(start_time, end_time)] = text
+    return subtitles
+
+def convert_time_to_frame(time_in_seconds, fps):
+    return int(time_in_seconds * fps)
+
+def extract_subtitles(video_path, subtitle_path):
+    video = cv2.VideoCapture(video_path)
+    fps = video.get(cv2.CAP_PROP_FPS)
+    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    subtitles = load_subtitles(subtitle_path)
+
+    subtitle_frames = []
+    for (start_time, end_time), text in subtitles.items():
+        start_frame = convert_time_to_frame(start_time, fps)
+        end_frame = convert_time_to_frame(end_time, fps)
+        subtitle_frames.append((start_frame, end_frame, text))
+
+    return subtitle_frames, total_frame
 
 def videomme_doc_to_visual(doc):
-    with open(Path(__file__).parent / "videomme.yaml", "r") as f:
-        raw_data = f.readlines()
-        safe_data = []
-        for i, line in enumerate(raw_data):
-            # remove function definition since yaml load cannot handle it
-            if "!function" not in line:
-                safe_data.append(line)
-    cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
     cache_dir = os.path.join(base_cache_dir, cache_name)
     video_path = doc["videoID"] + ".mp4"
     video_path = os.path.join(cache_dir, video_path)
@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):
 
 
 def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
+    question = doc["question"]
+    option = str(doc["options"])
+    question = question + "\n" + option
+    full_prompt = option_prompt + "\n" + question + "\n" + "The best answer is:"
+    return full_prompt
+    # Frames + Subs
+    # This video's subtitles are listed below:
+    # 【subtitles】
+
+    # Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
+    # 【question】
+    # The best answer is:
+    # Frames / Frames + Audio
+    # Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
+    # 【question】
+    # The best answer is:
+
+def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
+    cache_dir = os.path.join(base_cache_dir, cache_name)
+    video_path = doc["videoID"] + ".mp4"
+    subtitle_path = os.path.join(cache_dir, "subtitle", doc["videoID"] + ".srt")
+    video_path = os.path.join(cache_dir, video_path)
+    if os.path.exists(subtitle_path):  # denote have subtitle
+        subtitle = open(subtitle_path).readlines()
+    else:
+        subtitle = ""
+    subtitles_prompt = "This video's subtitles are listed below: \n"
+    if subtitle == "":
+        subtitle = "No subtitles available"
+    else:
+        if "gemini_api_flag" in model_specific_prompt_kwargs:  # specific for gemini_api
+            if model_specific_prompt_kwargs['gemini_api_flag'] == "full subtitle":
+                textlist = []
+                for ele in subtitle:
+                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
+                    matches = re.findall(pattern, ele)
+                    if matches:
+                        textlist.append(matches[0])
+                subtitle_text = "\n".join(textlist)
+        else:
+            if "frame_num" in model_specific_prompt_kwargs:
+                frame_num = model_specific_prompt_kwargs['frame_num']
+                subtitle_by_frame, total_frame = extract_subtitles(video_path, subtitle_path)
+                uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()
+
+                subtitle_by_frame_idx = []
+                for frame_idx in uniform_sampled_frames:
+                    for idx, title in enumerate(subtitle_by_frame):
+                        if frame_idx < title[1] and frame_idx >= title[0]:
+                            subtitle_by_frame_idx.append(idx)
+                subtitle_by_frame_idx = list(set(subtitle_by_frame_idx))
+
+                textlist = []
+                for idx in subtitle_by_frame_idx:
+                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
+                    raw_text = re.findall(pattern, subtitle_by_frame[idx][2])
+                    try:
+                        textlist.append(raw_text[0])
+                    except:
+                        continue
+                subtitle_text = "\n".join(textlist)
+        subtitle = subtitle_text
+
+    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
     question = doc["question"]
     option = str(doc["options"])
     question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
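The core of the new `videomme_doc_to_text_subtitle` logic is aligning subtitles with uniformly sampled frames: sample `frame_num` frame indices, keep only subtitles whose frame interval covers a sampled index, and join their text. A distilled, standalone sketch with toy values (not the repo's data):

```python
# Distilled illustration of the subtitle selection added above.
import numpy as np

subtitle_by_frame = [(0, 30, "hello"), (45, 90, "world"), (120, 150, "unused")]  # (start_frame, end_frame, text)
total_frame, frame_num = 100, 4

sampled = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()  # [0, 33, 66, 99]
kept = {
    idx
    for frame_idx in sampled
    for idx, (start, end, _) in enumerate(subtitle_by_frame)
    if start <= frame_idx < end  # same condition as frame_idx < title[1] and frame_idx >= title[0]
}
subtitle_text = "\n".join(subtitle_by_frame[idx][2] for idx in sorted(kept))
print(subtitle_text)  # "hello\nworld"; the third subtitle is never covered by a sampled frame
```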

lmms_eval/tasks/videomme/videomme.yaml

File mode changed from 100755 to 100644.
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+dataset_path: lmms-lab/Video-MME
+dataset_kwargs:
+  token: True
+  cache_dir: videomme
+  video: True
+  # From_YouTube: True
+task: videomme_w_subtitle
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videomme_doc_to_visual
+doc_to_text: !function utils.videomme_doc_to_text_subtitle
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.videomme_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: videomme_percetion_score
+    aggregation: !function utils.videomme_aggregate_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    frame_num: 32
+  gemini_api:
+    gemini_api_flag: "full subtitle"
+  # gpt4v:
+  #   pre_prompt: ""
+  #   post_prompt:
+  # # qwen_vl:
+  # #   pre_prompt: ""
+  # #   post_prompt: " Answer:"
+  # # otterhd:
+  # #   pre_prompt: ""
+  # #   post_prompt: " Answer:"
+  # xcomposer2_4khd:
+  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
+  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
+metadata:
+  - version: 0.0
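Because this YAML uses `!function` tags, `yaml.safe_load` cannot parse it directly; the task utils strip those lines before loading, as in the module-level code added to utils.py above. A small sketch of that pattern (it assumes the script sits next to videomme.yaml):

```python
# Drop lines containing custom `!function` tags before safe-loading, mirroring utils.py.
from pathlib import Path

import yaml

raw_lines = (Path(__file__).parent / "videomme.yaml").read_text().splitlines(keepends=True)
safe_lines = [line for line in raw_lines if "!function" not in line]
config = yaml.safe_load("".join(safe_lines))
print(config["dataset_kwargs"]["cache_dir"])  # -> videomme, per the YAML above
```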
