diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
index 23282805..3f5ea03f 100644
--- a/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
@@ -34,7 +34,7 @@ def compute_scores(self):
         for key, value in total_scores.items():
             print('{}:{}'.format(key, value))
         return total_scores
-    
+


 class ImageCaptionDataset(ImageBaseDataset):
diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
index c45c412d..13e3549d 100644
--- a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
@@ -104,7 +104,6 @@ def unzip_hf_zip(pth):
     pth = os.path.join(pth, 'video/')
     for filename in os.listdir(pth):
         if filename.endswith('.zip'):
-            # 构建完整的文件路径
             zip_path = os.path.join(pth, filename)
             # 解压 ZIP 文件
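For context, the hunk above only drops a redundant comment inside `unzip_hf_zip`; the surrounding loop walks the `video/` folder and unpacks each archive. A minimal sketch of that extraction step, assuming the standard-library `zipfile` module and in-place extraction (the function name and destination directory are illustrative, not taken from the repository):

```python
import os
import zipfile

def extract_video_zips(pth):
    # Iterate the video/ folder and unpack every .zip archive it contains.
    pth = os.path.join(pth, 'video/')
    for filename in os.listdir(pth):
        if filename.endswith('.zip'):
            zip_path = os.path.join(pth, filename)
            with zipfile.ZipFile(zip_path, 'r') as zf:
                zf.extractall(pth)  # assumed destination: the same video/ folder
```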
diff --git a/web_demos/web_demo_streamlit-2_5.py b/web_demos/web_demo_streamlit-2_5.py
index 4cee58c6..aa14ef97 100644
--- a/web_demos/web_demo_streamlit-2_5.py
+++ b/web_demos/web_demo_streamlit-2_5.py
@@ -6,10 +6,6 @@
 # Model path
 model_path = "openbmb/MiniCPM-Llama3-V-2_5"

-# User and assistant names
-U_NAME = "User"
-A_NAME = "Assistant"
-
 # Set page configuration
 st.set_page_config(
     page_title="MiniCPM-Llama3-V-2_5 Streamlit",
@@ -17,93 +13,69 @@
     layout="wide"
 )

-
 # Load model and tokenizer
 @st.cache_resource
 def load_model_and_tokenizer():
     print(f"load_model_and_tokenizer from {model_path}")
-    model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda")
+    model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU when CUDA is unavailable
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     return model, tokenizer

-
 # Initialize session state
 if 'model' not in st.session_state:
     st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
     st.session_state.model.eval()
-    print("model and tokenizer had loaded completed!")
+    print("model and tokenizer loaded successfully!")

-# Initialize session state
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []

 # Sidebar settings
-sidebar_name = st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit")
+st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit")
 max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
 repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01)
 top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
 top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1)
 temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)

-# Clear chat history button
-buttonClean = st.sidebar.button("Clear chat history", key="clean")
-if buttonClean:
+if st.sidebar.button("Clear chat history", key="clean"):
     st.session_state.chat_history = []
     st.session_state.response = ""
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    st.rerun()
+    st.experimental_rerun()  # rerun after clearing history; use st.rerun() on newer Streamlit releases

 # Display chat history
-for i, message in enumerate(st.session_state.chat_history):
+for message in st.session_state.chat_history:
     if message["role"] == "user":
-        with st.chat_message(name="user", avatar="user"):
-            if message["image"] is not None:
-                st.image(message["image"], caption='User uploaded image', width=448, use_column_width=False)
-                continue
-            elif message["content"] is not None:
-                st.markdown(message["content"])
+        if message["content"] is not None:
+            st.chat_message(name="user", avatar="user").markdown(message["content"])
     else:
-        with st.chat_message(name="model", avatar="assistant"):
-            st.markdown(message["content"])
+        st.chat_message(name="model", avatar="assistant").markdown(message["content"])

-# Select mode
+# Handle image uploads
 selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
 if selected_mode == "Image":
-    # Image mode
-    uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"],
-                                              accept_multiple_files=False)
+    uploaded_image = st.sidebar.file_uploader("Upload image", type=["jpg", "jpeg", "png"], accept_multiple_files=False)
     if uploaded_image is not None:
-        st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
-        # Add uploaded image to chat history
+        st.image(uploaded_image, caption="User uploaded image", use_column_width=True)
         st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})

 # User input box
 user_text = st.chat_input("Enter your question")
 if user_text:
-    with st.chat_message(U_NAME, avatar="user"):
-        st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
-        st.markdown(f"{U_NAME}: {user_text}")
+    st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
+    st.chat_message("user", avatar="user").markdown(user_text)

-    # Generate reply using the model
     model = st.session_state.model
     tokenizer = st.session_state.tokenizer
     imagefile = None
-    with st.chat_message(A_NAME, avatar="assistant"):
-        # If the previous message contains an image, pass the image to the model
-        if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
-            uploaded_image = st.session_state.chat_history[-2]["image"]
-            imagefile = Image.open(uploaded_image).convert('RGB')
-
-        msgs = [{"role": "user", "content": user_text}]
-        res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
-                         sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty,
-                         temperature=temperature, stream=True)
-
-        # Collect the generated_text str
-        generated_text = st.write_stream(res)
+    # If the previous message contains an image, pass the image to the model
+    if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
+        imagefile = Image.open(st.session_state.chat_history[-2]["image"]).convert('RGB')

-        st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None})
+    msgs = [{"role": "user", "content": user_text}]
+    res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer, sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty, temperature=temperature, stream=True)
+    generated_text = st.empty().write_stream(res)  # stream the reply into a placeholder and collect the full text
+    st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None})

     st.divider()
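The revised 2_5 demo falls back to CPU when CUDA is missing and collects the streamed reply through `write_stream`. A standalone sketch of the same load-and-stream pattern outside Streamlit, assuming `chat(..., stream=True)` yields text chunks as the demo code above relies on; the image path and question are placeholders:

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model_path = "openbmb/MiniCPM-Llama3-V-2_5"
device = "cuda" if torch.cuda.is_available() else "cpu"  # same fallback as the demo

# Note: float16 inference on CPU can be slow or unsupported for some ops;
# the demo keeps fp16 regardless of device.
model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
                                  torch_dtype=torch.float16).to(device=device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
msgs = [{"role": "user", "content": "Describe this image."}]

# stream=True returns an iterable of text chunks, which the Streamlit code feeds to write_stream.
reply = ""
for chunk in model.chat(image=image, msgs=msgs, context=None, tokenizer=tokenizer,
                        sampling=True, top_p=0.8, top_k=100, temperature=0.7,
                        repetition_penalty=1.05, stream=True):
    reply += chunk
print(reply)
```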
diff --git a/web_demos/web_demo_streamlit.py b/web_demos/web_demo_streamlit.py
index 1b893e6e..909f6b03 100644
--- a/web_demos/web_demo_streamlit.py
+++ b/web_demos/web_demo_streamlit.py
@@ -1,99 +1,118 @@
-import streamlit as st
+import gradio as gr
 from PIL import Image
+import traceback
+import re
 import torch
+import argparse
 from transformers import AutoModel, AutoTokenizer

-# Model path
-model_path = "openbmb/MiniCPM-V-2"
+# README: how to run the demo on different devices
+# For Nvidia GPUs that support BF16 (like A100, H100, RTX3090):
+# python web_demo.py --device cuda --dtype bf16

-# User and assistant names
-U_NAME = "User"
-A_NAME = "Assistant"
+# For Nvidia GPUs that do NOT support BF16 (like V100, T4, RTX2080):
+# python web_demo.py --device cuda --dtype fp16

-# Set page configuration
-st.set_page_config(
-    page_title="Minicpm-V-2 Streamlit",
-    page_icon=":robot:",
-    layout="wide"
-)
+# For Mac with MPS (Apple silicon or AMD GPUs):
+# PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo.py --device mps --dtype fp16

-# Load model and tokenizer
-@st.cache_resource
-def load_model_and_tokenizer():
-    print(f"load_model_and_tokenizer from {model_path}")
-    model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(
-        device="cuda:0", dtype=torch.bfloat16)
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    return model, tokenizer
-
-# Initialize session state
-if 'model' not in st.session_state:
-    st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
-    print("model and tokenizer had loaded completed!")
+# Argparser
+parser = argparse.ArgumentParser(description='demo')
+parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
+parser.add_argument('--dtype', type=str, default='bf16', help='bf16 or fp16')
+args = parser.parse_args()
+device = args.device
+assert device in ['cuda', 'mps']
+if args.dtype == 'bf16':
+    if device == 'mps':
+        print('Warning: MPS does not support bf16, will use fp16 instead')
+        dtype = torch.float16
+    else:
+        dtype = torch.bfloat16
+else:
+    dtype = torch.float16

-# Initialize session state
-if 'chat_history' not in st.session_state:
-    st.session_state.chat_history = []
+# Load model
+model_path = 'openbmb/MiniCPM-V-2'
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

-# Sidebar settings
-sidebar_name = st.sidebar.title("Minicpm-V-2 Streamlit")
-max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
-top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
-temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
+model = model.to(device=device, dtype=dtype)
+model.eval()

-# Clear chat history button
-buttonClean = st.sidebar.button("Clear chat history", key="clean")
-if buttonClean:
-    st.session_state.chat_history = []
-    st.session_state.response = ""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    st.rerun()
+ERROR_MSG = "Error, please retry"
+model_name = 'MiniCPM-V 2.0'

-# Display chat history
-for i, message in enumerate(st.session_state.chat_history):
-    if message["role"] == "user":
-        with st.chat_message(name="user", avatar="user"):
-            if message["image"] is not None:
-                st.image(message["image"], caption='User uploaded image', width=468, use_column_width=False)
-                continue
-            elif message["content"] is not None:
-                st.markdown(message["content"])
-    else:
-        with st.chat_message(name="model", avatar="assistant"):
-            st.markdown(message["content"])
+form_radio = {'choices': ['Beam Search', 'Sampling'], 'value': 'Sampling', 'interactive': True, 'label': 'Decode Type'}
+# Beam Form
+num_beams_slider = {'minimum': 0, 'maximum': 5, 'value': 3, 'step': 1, 'interactive': True, 'label': 'Num Beams'}
+repetition_penalty_slider = {'minimum': 0, 'maximum': 3, 'value': 1.2, 'step': 0.01, 'interactive': True, 'label': 'Repetition Penalty'}
+repetition_penalty_slider2 = {'minimum': 0, 'maximum': 3, 'value': 1.05, 'step': 0.01, 'interactive': True, 'label': 'Repetition Penalty'}

-# Select mode
-selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
-if selected_mode == "Image":
-    # Image mode
-    uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], accept_multiple_files=False)
-    if uploaded_image is not None:
-        st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
-        # Add uploaded image to chat history
-        st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})
+# Sampling Form
+max_new_tokens_slider = {'minimum': 1, 'maximum': 4096, 'value': 1024, 'step': 1, 'interactive': True, 'label': 'Max New Tokens'}
+top_p_slider = {'minimum': 0, 'maximum': 1, 'value': 0.8, 'step': 0.05, 'interactive': True, 'label': 'Top P'}
+top_k_slider = {'minimum': 0, 'maximum': 200, 'value': 100, 'step': 1, 'interactive': True, 'label': 'Top K'}
+temperature_slider = {'minimum': 0, 'maximum': 2, 'value': 0.7, 'step': 0.05, 'interactive': True, 'label': 'Temperature'}

-# User input box
-user_text = st.chat_input("Enter your question")
-if user_text:
-    with st.chat_message(U_NAME, avatar="user"):
-        st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
-        st.markdown(f"{U_NAME}: {user_text}")
+def create_component(params, comp='Slider'):
+    if comp == 'Slider':
+        return gr.Slider(minimum=params['minimum'], maximum=params['maximum'], value=params['value'],
+                         step=params['step'], interactive=params['interactive'], label=params['label'])
+    elif comp == 'Radio':
+        return gr.Radio(choices=params['choices'], value=params['value'], interactive=params['interactive'], label=params['label'])
+    elif comp == 'Button':
+        return gr.Button(value=params['value'], interactive=True)

-    # Generate reply using the model
-    model = st.session_state.model
-    tokenizer = st.session_state.tokenizer
+def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
+    default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 1024}
+    if params is None:
+        params = default_params
+    if img is None:
+        return -1, "Error, invalid image, please upload a new image", None, None
+    try:
+        image = img.convert('RGB')
+        answer, context, _ = model.chat(image=image, msgs=msgs, context=None, tokenizer=tokenizer, **params)
+        res = re.sub(r'(<box>.*</box>)', '', answer)
+        res = res.replace('<ref>', '')
+        res = res.replace('</ref>', '')
+        res = res.replace('<box>', '')
+        answer = res.replace('</box>', '')
+        return 0, answer, None, None
+    except Exception as err:
+        print(err)
+        traceback.print_exc()
+        return -1, ERROR_MSG, None, None

-    with st.chat_message(A_NAME, avatar="assistant"):
-        # If the previous message contains an image, pass the image to the model
-        if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
-            uploaded_image = st.session_state.chat_history[-2]["image"]
-            imagefile = Image.open(uploaded_image).convert('RGB')
+def upload_img(image, _chatbot, _app_session):
+    image = Image.fromarray(image)
+    _app_session['sts'] = None
+    _app_session['ctx'] = []
+    _app_session['img'] = image
+    _chatbot.append(('', 'Image uploaded successfully, you can talk to me now'))
+    return _chatbot, _app_session

-        msgs = [{"role": "user", "content": user_text}]
-        res, context, _ = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
-                                     sampling=True, top_p=top_p, temperature=temperature)
-        st.markdown(f"{A_NAME}: {res}")
-        st.session_state.chat_history.append({"role": "model", "content": res, "image": None})
+def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
+    if _app_cfg.get('ctx', None) is None:
+        _chat_bot.append((_question, 'Please upload an image to start'))
+        return '', _chat_bot, _app_cfg
+    _context = _app_cfg['ctx'].copy()
+    if _context:
+        _context.append({"role": "user", "content": _question})
+    else:
+        _context = [{"role": "user", "content": _question}]
+    print('<User>:', _question)
+    if params_form == 'Beam Search':
+        params = {'sampling': False, 'num_beams': num_beams, 'repetition_penalty': repetition_penalty, "max_new_tokens": 896}
+    else:
+        params = {'sampling': True, 'top_p': top_p, 'top_k': top_k, 'temperature': temperature, 'repetition_penalty': repetition_penalty_2, "max_new_tokens": 896}
+    code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params)
+    print('<Assistant>:', _answer)
+    _context.append({"role": "assistant", "content": _answer})
+    _chat_bot.append((_question, _answer))
+    if code == 0:
+        _app_cfg['ctx'] = _context
+        _app_cfg['sts'] = sts
+    return '', _chat_bot, _app_cfg

-    st.divider()
+
+def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
+    return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature)
\ No newline at end of file
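The rewritten `web_demo_streamlit.py` hunk ends at the callback helpers, so the Gradio layout that would wire them together is not shown in the patch. A minimal wiring sketch under that assumption, reusing the component dictionaries and callbacks defined above; every `gr.Blocks` layout detail below is illustrative, not code from the patch:

```python
# Hypothetical UI assembly for the helpers in the patch (create_component, upload_img, respond).
with gr.Blocks() as demo:
    app_session = gr.State({'sts': None, 'ctx': None, 'img': None})
    with gr.Row():
        with gr.Column(scale=1):
            params_form = create_component(form_radio, comp='Radio')
            num_beams = create_component(num_beams_slider)
            repetition_penalty = create_component(repetition_penalty_slider)
            repetition_penalty_2 = create_component(repetition_penalty_slider2)
            top_p = create_component(top_p_slider)
            top_k = create_component(top_k_slider)
            temperature = create_component(temperature_slider)
        with gr.Column(scale=3):
            image_input = gr.Image(label='Upload an image')
            chat_bot = gr.Chatbot(label=model_name)
            question_box = gr.Textbox(placeholder='Ask something about the image')

    # upload_img resets the session state and stores the PIL image used by chat()
    image_input.upload(upload_img, [image_input, chat_bot, app_session], [chat_bot, app_session])
    # respond() routes the question through chat() with the selected decode parameters
    question_box.submit(
        respond,
        [question_box, chat_bot, app_session, params_form, num_beams, repetition_penalty,
         repetition_penalty_2, top_p, top_k, temperature],
        [question_box, chat_bot, app_session])

demo.launch()
```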