EvolvingLMMs-Lab
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
Lines changed: 1 addition & 1 deletion
diff --git a/lmms_eval/models/claude.py b/lmms_eval/models/claude.py
Lines changed: 1 addition & 0 deletions
diff --git a/lmms_eval/models/gemini_api.py b/lmms_eval/models/gemini_api.py
Lines changed: 3 additions & 1 deletion
diff --git a/lmms_eval/models/model_utils/load_video.py b/lmms_eval/models/model_utils/load_video.py
Lines changed: 2 additions & 0 deletions
diff --git a/lmms_eval/tasks/live_bench/live_bench.yaml b/lmms_eval/tasks/live_bench/live_bench.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/lmms_eval/tasks/live_bench/utils.py b/lmms_eval/tasks/live_bench/utils.py
Lines changed: 197 additions & 0 deletions
diff --git a/tools/live_bench/create_dataset.py b/tools/live_bench/create_dataset.py
Lines changed: 12 additions & 0 deletions
@@ -325,7 +325,7 @@ def evaluate(
             # hack: remove image columns to speed avoid loading images and speed up postprocessing
             # reason: doc_iterator will actually load image if it's in the doc.
             docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
-            if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name and "llava_wilder" not in task_name and "livebench" not in task_name:
+            if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name and "llava_wilder" not in task_name and "live_bench" not in task_name:
                 remove_cols = []
                 features = docs.features
                 # If it is an Image instance or a Sequence of Image instance. Remove it
 
@@ -238,6 +238,7 @@ def generate_until(self, requests) -> List[str]:
                         pbar.update(1)
                         continue
 
+            response_text = message.content[0].text
             res.append(message.content[0].text)
             pbar.update(1)
 
 
@@ -31,7 +31,7 @@
 class GeminiAPI(lmms):
     def __init__(
         self,
-        model_version: str = "gemini-1.5-flash-latest",
+        model_version: str = "gemini-1.5-pro",
         modality: str = "image",
         timeout: int = 120,
         continual_mode: bool = False,
@@ -46,6 +46,8 @@ def __init__(
         if self.continual_mode and response_persistent_folder is None:
             raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
         self.response_persistent_folder = response_persistent_folder
+        if not os.path.exists(self.response_persistent_folder):
+            os.makedirs(self.response_persistent_folder)
         self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
 
         if os.path.exists(self.response_persistent_file):
 
@@ -29,6 +29,8 @@ def record_video_length_packet(container):
 
 
 def read_video_pyav(video_path, num_frm=8):
+    container = av.open(video_path)
+
     if "webm" not in video_path and "mkv" not in video_path:
         # For mp4, we try loading with stream first
         try:
 
@@ -0,0 +1,29 @@
+dataset_path: lmms-lab/LiveBench
+dataset_kwargs:
+  token: True
+task: "live_bench"  # evaluator.py checks task names for the substring "live_bench" before dropping image columns
+test_split: test
+dataset_name: 2024-06
+output_type: generate_until
+doc_to_visual: !function utils.livebench_doc_to_visual
+doc_to_text: !function utils.livebench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.livebench_process_results
+metric_list:
+  - metric: gpt4_eval_score  # same key that utils.livebench_process_results puts in its result dict
+    aggregation: !function utils.livebench_aggregate_results
+    higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+metadata:  # utils.py parses this file at import time and reads api_type and gpt_eval_model_name
+  version: "2024-06"
+  api_type : openai  # utils.py has branches for "openai" and "azure"
+  gpt_eval_model_name: "gpt-4o"  # model name passed to the judge request in utils.get_chat_response
@@ -0,0 +1,197 @@
+from pathlib import Path
+import yaml
+import os
+import requests
+import logging
+import time
+import base64
+import openai
+import json
+from io import BytesIO
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+
+# Logger registered under the name "lmms-eval".
+eval_logger = logging.getLogger("lmms-eval")
+
+
+# Load the task config that sits next to this module. Lines carrying the
+# custom "!function" tag are dropped first because yaml.safe_load cannot
+# construct them.
+with open(Path(__file__).parent / "live_bench.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = [line for line in raw_data if "!function" not in line]
+
+    config = yaml.safe_load("".join(safe_data))
+
+# Judge model and API flavour come from the "metadata" section of the config.
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = config["metadata"]["api_type"]
+
+# Endpoint, key and request headers per API flavour; environment variables
+# override the defaults.
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "api-key": API_KEY,
+        "Content-Type": "application/json",
+    }
+
+else:
+    # Without this branch API_URL / API_KEY / headers stay undefined and the
+    # first evaluation call fails with a NameError far from the real cause.
+    raise ValueError(f"Unsupported api_type {API_TYPE!r} in live_bench.yaml; expected 'openai' or 'azure'")
+
+# Text part of the judge request. format_prompt fills the {prompt},
+# {generation}, {reference} and {criteria} placeholders with str.format, so
+# the doubled braces around the JSON example come out as single literal
+# braces. get_chat_response places the images before this text in the same
+# user message.
+_PROMPT_WITH_IMAGE = """\
+[Question]
+
+{prompt}
+
+[Assistant Response]
+
+{generation}
+
+[Ground Truth Response]
+
+{reference}
+
+[System]
+
+Rate whether the assistant response correctly matches the ground truth, in regards to the image above.
+
+The rating should be 0-10, where 0 is incorrect and 10 is correct.
+
+Below is the specific criteria for rating:
+
+{criteria}
+
+Total score is out of 10. If the model's answer cannot be provided due to political reasons, please assign a score of 0 for further processing. If the model's response is biased due to political factors, please score it based on its understanding of the image, but reduce the objectivity score accordingly.
+
+Your response should be in the JSON format:
+```json
+{{
+    "Explanation": "(your explanation)",
+    "Rating": "(int)"
+}}
+```
+"""
+
+
+def format_prompt(question, ground_truth_answer, answer, criteria):
+    """Fill the judge prompt template for one question.
+
+    Args:
+        question: Substituted for the ``{prompt}`` placeholder.
+        ground_truth_answer: Substituted for ``{reference}``.
+        answer: Substituted for ``{generation}``.
+        criteria: Substituted for ``{criteria}``.
+
+    Returns:
+        ``_PROMPT_WITH_IMAGE`` with all four placeholders filled in.
+    """
+    template_fields = {
+        "prompt": question,
+        "generation": answer,
+        "reference": ground_truth_answer,
+        "criteria": criteria,
+    }
+    return _PROMPT_WITH_IMAGE.format(**template_fields)
+
+
+def get_chat_response(base64_images, question, ground_truth_answer, answer, criteria, max_retries=5, wait_time=10):
+    """Ask the judge model to rate one answer.
+
+    Args:
+        base64_images: Base64-encoded images, sent ahead of the text prompt.
+        question: Question shown to the evaluated model.
+        ground_truth_answer: Reference answer for the question.
+        answer: Answer produced by the evaluated model.
+        criteria: Rating criteria inserted into the prompt.
+        max_retries: Maximum number of request attempts for transient errors.
+        wait_time: Seconds to sleep between two attempts.
+
+    Returns:
+        A ``(rating, explanation, model_name)`` tuple. ``rating`` is the
+        numeric "Rating" field of the judge's JSON reply; on any failure it
+        is ``-1`` and ``explanation`` holds the error text.
+    """
+    # NOTE(review): the client is built from API_KEY only; API_URL, headers
+    # and the "azure" settings defined at module level are not used here -
+    # confirm that is intended.
+    client = openai.OpenAI(api_key=API_KEY)
+
+    # image_to_base64 in this module encodes PNG, so label the data URLs as PNG.
+    content = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}} for base64_image in base64_images]
+    content.append(
+        {
+            "type": "text",
+            "text": format_prompt(question, ground_truth_answer, answer, criteria),
+        }
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": content,
+        }
+    ]
+
+    # Errors worth retrying. The OpenAI SDK raises its own exception classes,
+    # not requests exceptions; the requests class is kept so that anything
+    # retried before is still retried.
+    transient_errors = (
+        openai.APIConnectionError,
+        openai.RateLimitError,
+        openai.InternalServerError,
+        requests.exceptions.RequestException,
+    )
+
+    last_error = "No request was attempted"
+    for attempt in range(max_retries):
+        try:
+            response = client.chat.completions.create(model=GPT_EVAL_MODEL_NAME, messages=messages, max_tokens=1024, response_format={"type": "json_object"}, temperature=0.0)
+            response_data = json.loads(response.choices[0].message.content)
+            rating = response_data["Rating"]
+            # The prompt shows the rating as a quoted value, so the judge may
+            # answer "8" instead of 8; callers compare the rating numerically.
+            if isinstance(rating, str):
+                rating = float(rating)
+                if rating.is_integer():
+                    rating = int(rating)
+            explanation = response_data["Explanation"]
+            return rating, explanation, GPT_EVAL_MODEL_NAME
+        except transient_errors as e:
+            last_error = str(e)
+            eval_logger.warning(f"Request failed on attempt {attempt + 1}: {e}")
+            # No point in sleeping once the attempts are used up.
+            if attempt < max_retries - 1:
+                time.sleep(wait_time)
+        except Exception as e:
+            # Malformed replies and other non-transient errors are not retried.
+            eval_logger.error(f"Error on attempt {attempt + 1}: {e}")
+            return -1, str(e), GPT_EVAL_MODEL_NAME
+
+    # Reached when every attempt failed, and also when max_retries <= 0, so
+    # callers always get a 3-tuple back.
+    eval_logger.error(f"Failed to get response after {max_retries} attempts")
+    return -1, last_error, GPT_EVAL_MODEL_NAME
+
+
+def image_to_base64(pil_image):
+    """Encode an image as base64 text.
+
+    Args:
+        pil_image: Object whose ``save(fp, format=...)`` writes the image
+            bytes into a file-like object.
+
+    Returns:
+        The PNG bytes written by ``save``, base64-encoded and decoded as UTF-8.
+    """
+    png_buffer = BytesIO()
+    pil_image.save(png_buffer, format="PNG")
+    encoded_bytes = base64.b64encode(png_buffer.getvalue())
+    return encoded_bytes.decode("utf-8")
+
+
+# NOTE(review): neither of these module-level names is referenced by the code
+# visible in this diff - confirm they are still needed.
+_images = {}
+
+dataset = None
+
+
+def livebench_doc_to_visual(doc):
+    """Return every image in ``doc["images"]`` converted to RGB, in order."""
+    rgb_images = []
+    for picture in doc["images"]:
+        rgb_images.append(picture.convert("RGB"))
+    return rgb_images
+
+
+def livebench_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    """Build the text prompt for one document.
+
+    Args:
+        doc: Mapping with a ``"question"`` entry.
+        model_specific_prompt_kwargs: Optional mapping that may hold
+            ``"pre_prompt"`` and ``"post_prompt"``; a missing key counts as
+            an empty string, and ``None`` is treated as an empty mapping.
+
+    Returns:
+        The question wrapped in the pre- and post-prompt.
+    """
+    prompt_kwargs = {} if model_specific_prompt_kwargs is None else model_specific_prompt_kwargs
+    prefix = prompt_kwargs.get("pre_prompt", "")
+    suffix = prompt_kwargs.get("post_prompt", "")
+    return "{}{}{}".format(prefix, doc["question"], suffix)
+
+
+# Subtask names that get their own row in the aggregated report.
+SUBTASKS = ("Basic Understanding", "Contextual Analysis", "Deeper Implications", "Broader Implications", "Further Insights")
+
+
+def _normalize_rating(rating):
+    """Return ``rating`` as a number, or -1 when it cannot be read as one."""
+    if isinstance(rating, (int, float)) and not isinstance(rating, bool):
+        return rating
+    try:
+        value = float(rating)
+    except (TypeError, ValueError):
+        return -1
+    return int(value) if value.is_integer() else value
+
+
+def livebench_process_results(doc, results):
+    """Score one model response with the judge model.
+
+    Args:
+        doc: Dataset row; the keys read here are "subtask", "criteria",
+            "question", "answer", "id" and "images".
+        results: Model outputs for this row; only the first one is judged.
+
+    Returns:
+        ``{"gpt4_eval_score": {...}}`` with "rating", "explanation",
+        "model_name", "subtask" and "id". The rating is -1 when there is no
+        response or the judge call failed.
+    """
+    subtask = doc["subtask"]
+    criteria = doc["criteria"]
+    if subtask not in SUBTASKS:
+        # Use the exact spelling from SUBTASKS; the lower-case form is not a
+        # key of the per-subtask table built during aggregation.
+        subtask = "Further Insights"
+    if not results:
+        return {"gpt4_eval_score": {"rating": -1, "explanation": "No response", "model_name": "N/A", "subtask": subtask, "id": doc["id"]}}
+
+    # Encode the images only once it is known that there is something to judge.
+    base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)]
+    rating, explanation, model_name = get_chat_response(base64_images=base64_images, question=doc["question"], ground_truth_answer=doc["answer"], answer=results[0], criteria=criteria)
+    # The judge may return the rating as a string; comparing that with 0
+    # would raise a TypeError.
+    rating = _normalize_rating(rating)
+    if rating >= 0:
+        return {"gpt4_eval_score": {"rating": rating, "explanation": explanation, "model_name": model_name, "subtask": subtask, "id": doc["id"]}}
+    else:
+        return {"gpt4_eval_score": {"rating": -1, "explanation": explanation, "model_name": "N/A", "subtask": subtask, "id": doc["id"]}}
+
+
+def livebench_aggregate_results(results):
+    """Aggregate per-document judge ratings into one score.
+
+    Prints a table with the count and average score of every entry in
+    SUBTASKS, then returns the overall average.
+
+    Args:
+        results: Iterable of dicts with "rating" and "subtask"; entries whose
+            rating is -1 are skipped.
+
+    Returns:
+        The mean rating scaled to 0-100, or ``None`` when no entry has a
+        valid rating.
+    """
+    fallback = "Further Insights"
+    # Case-insensitive lookup, so "further insights" lands in the
+    # "Further Insights" bucket instead of raising a KeyError.
+    canonical = {name.lower(): name for name in SUBTASKS}
+    score = {name: [] for name in SUBTASKS}
+    score.setdefault(fallback, [])
+
+    sum_score, count = 0, 0
+    for result in results:
+        if result["rating"] == -1:
+            continue
+        normalized = result["rating"] / 10
+        sum_score += normalized
+        count += 1
+        subtask = canonical.get(str(result["subtask"]).lower(), fallback)
+        score[subtask].append(normalized)
+
+    # An empty subtask is shown as NaN without going through np.mean([]),
+    # which emits a RuntimeWarning.
+    rows = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100 if score[subtask] else float("nan")) for subtask in SUBTASKS]
+    res = pd.DataFrame(rows, columns=["Subtask", "Count", "Average Score"])
+    print("=" * 50)
+    print(res)
+    print("=" * 50)
+    if count == 0:
+        eval_logger.warning("No valid scores to aggregate")
+    return sum_score / count * 100 if count > 0 else None
@@ -0,0 +1,12 @@
+from live_bench.websites import load_websites, load_websites_from_file
+from live_bench import LiveBench
+
+
+if __name__ == "__main__":
+    import os
+
+    # Directory handed to load_websites_from_file for the "human" capture
+    # pass. The default is the path this script originally hard-coded; set
+    # LIVE_BENCH_HUMAN_IMAGES_DIR to run it on another machine.
+    human_images_dir = os.environ.get("LIVE_BENCH_HUMAN_IMAGES_DIR", "/data/pufanyi/project/lmms-eval/temp/images")
+
+    # First pass: capture the default website list with a headless driver.
+    website = load_websites()
+    dataset = LiveBench(force_clear=False, name="2024-06")
+    dataset.capture(websites=website, driver_kwargs={"headless": True}, screen_shoter="single_screen", shoter_kwargs={"screen_size": (1024, 1024)}, qa_generator="gpt4v", scorer="gpt4v", checker="gemini")
+
+    # Second pass: entries loaded from the local directory, using the
+    # "human" screen shoter.
+    website = load_websites_from_file(human_images_dir)
+    dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="gpt4v", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
+    dataset.upload()