Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Request for Official Code to Reproduce MMVP Performance on Eagle-X4-8B-Plus #14

Open
KiUngSong opened this issue Sep 12, 2024 · 3 comments
Assignees

Comments

@KiUngSong
Copy link

I am trying to reproduce the MMVP benchmark performance of Eagle-X4-8B-Plus.
Could you provide the official code or scripts for this, as I am unable to match the published benchmarks?

@flyinglynx
Copy link
Collaborator

Thank you for pointing this out. We were also curious about the unusually high performance 😂, and after reviewing it today, I found the issue: we were using an incorrect evaluation metric. We'll update the results ASAP, but I'd like to explain the situation here first.

The official MMVP evaluation code requires both questions in a pair to be answered correctly for the pair to be marked as correct. However, our code only computed per-question accuracy (the fraction of individual questions answered correctly), which led to the inconsistency.

Below is the code we used for evaluation:

import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

# for debug
import sys
sys.path.append(os.getcwd())

from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

import math
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from copy import deepcopy

def calculate_score(predictions):
    """Print and return the per-question accuracy of ``predictions``.

    Each prediction is a mapping with an ``"answer"`` entry (the ground
    truth) and a ``"response"`` entry (the model output). A prediction counts
    as correct when, ignoring case and surrounding whitespace, one of the two
    strings contains the other.

    NOTE: this is per-question accuracy. The official MMVP metric instead
    requires both questions of a pair to be answered correctly, so this
    number is not directly comparable to it.

    Args:
        predictions: iterable of prediction mappings as described above.

    Returns:
        The fraction of predictions judged correct, or ``0.0`` when
        ``predictions`` is empty.
    """
    correct, total = 0, 0

    for prediction in predictions:
        gt = prediction["answer"].strip().lower()
        answer = prediction["response"].strip().lower()
        # The empty string is a substring of every string, so a blank
        # response (or blank ground truth) must not be scored as a match.
        if gt and answer and (gt in answer or answer in gt):
            correct += 1
        total += 1

    # Avoid ZeroDivisionError when there is nothing to score.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy}")
    return accuracy

def eval_model(args):
    """Run the model over the MMVP questions, save responses, and score them.

    Reads ``Questions.csv`` and the ``MMVP Images`` folder under
    ``args.directory``, generates one response per CSV row, writes each
    record as one JSON line to ``args.answers_file``, and finally passes all
    records to ``calculate_score``.

    Args:
        args: parsed command-line namespace. This function reads
            ``model_path``, ``model_base``, ``directory``, ``answers_file``,
            ``conv_mode``, ``temperature``, ``top_p`` and ``num_beams``.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    # The fourth value returned by the loader (context length) is not used here.
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, args.model_base, model_name)

    # Load the benchmark questions with pandas' default (comma-separated) parsing.
    questions_path = os.path.join(args.directory, 'Questions.csv')
    df = pd.read_csv(questions_path)

    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)
    all_answers = []

    # The context manager closes the answers file even if generation fails midway.
    with open(answers_file, "w") as ans_file:
        for index, row in tqdm(df.iterrows(), total=len(df)):
            cur_prompt = row['Question'] + " " + row['Options']
            qs = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt + "\nAnswer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            # Load the corresponding image: files are looked up as "<row index + 1>.jpg".
            # NOTE(review): assumes the DataFrame keeps its default 0-based integer index.
            photo_id = index + 1
            image_path = os.path.join(args.directory, 'MMVP Images', f"{photo_id}.jpg")
            image = Image.open(image_path)
            image_sizes = [image.size]

            image_tensor = process_images([image], image_processor, model.config)
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
            input_ids = input_ids.to(device='cuda', non_blocking=True).unsqueeze(0)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                    image_sizes=image_sizes,
                    # Sample only when a positive temperature is requested.
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=64,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            answer_dict = {
                "question_id": photo_id,
                "prompt": cur_prompt,
                "answer": row["Correct Answer"],
                "response": outputs,
                "answer_id": shortuuid.uuid(),
                "model_id": model_name,
            }
            all_answers.append(answer_dict)
            # Flush after every record so partial results survive an interrupted run.
            ans_file.write(json.dumps(answer_dict) + "\n")
            ans_file.flush()

    calculate_score(all_answers)


if __name__ == "__main__":
    # Command-line options as (flag, type, default) rows, registered in order.
    parser = argparse.ArgumentParser()
    for option, option_type, option_default in (
        ("--model-path", str, "PATH_TO_MLLM"),
        ("--model-base", str, None),
        ("--directory", str, "PATH_TO_MMVP_DATASET"),
        ("--answers-file", str, "playground/data/eval_local_files/mmvp/debug/answers.jsonl"),
        ("--conv-mode", str, "llava_v1"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
    ):
        parser.add_argument(option, type=option_type, default=option_default)
    args = parser.parse_args()

    # Hand the parsed options to the evaluation entry point.
    eval_model(args)

Aside from the metric, we also used direct string matching to compute performance. We are now using the GPT API to judge whether responses are correct, and I will update you with the latest scores soon.

Thank you again for raising this issue—this was our mistake, and we will be updating the arXiv tech report accordingly.

@flyinglynx flyinglynx self-assigned this Sep 13, 2024
@dingangui
Copy link

dingangui commented Sep 24, 2024

I tested it and the accuracy rate is 53.3%, using GPT-4-Turbo to judge the answer.

@dingangui
Copy link

dingangui commented Sep 27, 2024

Thank you for pointing this out. We were also curious about the unusually high performance 😂, and after reviewing it today, I found the issue: we were using an incorrect evaluation metric. We'll update the results ASAP, but I'd like to explain the situation here first.

The official MMVP evaluation code requires both questions in a pair to be answered correctly for the pair to be marked as correct. However, our code only computed per-question accuracy (the fraction of individual questions answered correctly), which led to the inconsistency.

Below is the code we used for evaluation:

import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

# for debug
import sys
sys.path.append(os.getcwd())

from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from eagle.conversation import conv_templates, SeparatorStyle
from eagle.model.builder import load_pretrained_model
from eagle.utils import disable_torch_init
from eagle.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

import math
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from copy import deepcopy

def calculate_score(predictions):
    """Print and return the per-question accuracy of ``predictions``.

    Each prediction is a mapping with an ``"answer"`` entry (the ground
    truth) and a ``"response"`` entry (the model output). A prediction counts
    as correct when, ignoring case and surrounding whitespace, one of the two
    strings contains the other.

    NOTE: this is per-question accuracy. The official MMVP metric instead
    requires both questions of a pair to be answered correctly, so this
    number is not directly comparable to it.

    Args:
        predictions: iterable of prediction mappings as described above.

    Returns:
        The fraction of predictions judged correct, or ``0.0`` when
        ``predictions`` is empty.
    """
    correct, total = 0, 0

    for prediction in predictions:
        gt = prediction["answer"].strip().lower()
        answer = prediction["response"].strip().lower()
        # The empty string is a substring of every string, so a blank
        # response (or blank ground truth) must not be scored as a match.
        if gt and answer and (gt in answer or answer in gt):
            correct += 1
        total += 1

    # Avoid ZeroDivisionError when there is nothing to score.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy}")
    return accuracy

def eval_model(args):
    """Run the model over the MMVP questions, save responses, and score them.

    Reads ``Questions.csv`` and the ``MMVP Images`` folder under
    ``args.directory``, generates one response per CSV row, writes each
    record as one JSON line to ``args.answers_file``, and finally passes all
    records to ``calculate_score``.

    Args:
        args: parsed command-line namespace. This function reads
            ``model_path``, ``model_base``, ``directory``, ``answers_file``,
            ``conv_mode``, ``temperature``, ``top_p`` and ``num_beams``.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    # The fourth value returned by the loader (context length) is not used here.
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, args.model_base, model_name)

    # Load the benchmark questions with pandas' default (comma-separated) parsing.
    questions_path = os.path.join(args.directory, 'Questions.csv')
    df = pd.read_csv(questions_path)

    answers_file = os.path.expanduser(args.answers_file)
    answers_dir = os.path.dirname(answers_file)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)
    all_answers = []

    # The context manager closes the answers file even if generation fails midway.
    with open(answers_file, "w") as ans_file:
        for index, row in tqdm(df.iterrows(), total=len(df)):
            cur_prompt = row['Question'] + " " + row['Options']
            qs = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt + "\nAnswer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            # Load the corresponding image: files are looked up as "<row index + 1>.jpg".
            # NOTE(review): assumes the DataFrame keeps its default 0-based integer index.
            photo_id = index + 1
            image_path = os.path.join(args.directory, 'MMVP Images', f"{photo_id}.jpg")
            image = Image.open(image_path)
            image_sizes = [image.size]

            image_tensor = process_images([image], image_processor, model.config)
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
            input_ids = input_ids.to(device='cuda', non_blocking=True).unsqueeze(0)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                    image_sizes=image_sizes,
                    # Sample only when a positive temperature is requested.
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=64,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            answer_dict = {
                "question_id": photo_id,
                "prompt": cur_prompt,
                "answer": row["Correct Answer"],
                "response": outputs,
                "answer_id": shortuuid.uuid(),
                "model_id": model_name,
            }
            all_answers.append(answer_dict)
            # Flush after every record so partial results survive an interrupted run.
            ans_file.write(json.dumps(answer_dict) + "\n")
            ans_file.flush()

    calculate_score(all_answers)


if __name__ == "__main__":
    # Command-line options as (flag, type, default) rows, registered in order.
    parser = argparse.ArgumentParser()
    for option, option_type, option_default in (
        ("--model-path", str, "PATH_TO_MLLM"),
        ("--model-base", str, None),
        ("--directory", str, "PATH_TO_MMVP_DATASET"),
        ("--answers-file", str, "playground/data/eval_local_files/mmvp/debug/answers.jsonl"),
        ("--conv-mode", str, "llava_v1"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
    ):
        parser.add_argument(option, type=option_type, default=option_default)
    args = parser.parse_args()

    # Hand the parsed options to the evaluation entry point.
    eval_model(args)

Aside from the metric, we also used direct string matching to compute performance. We are now using the GPT API to judge whether responses are correct, and I will update you with the latest scores soon.

Thank you again for raising this issue—this was our mistake, and we will be updating the arXiv tech report accordingly.

Should the conv-mode for Eagle-X4-8B-Plus be 'llava_v1' or 'llama3'?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants