diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py b/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
index 23282805..3f5ea03f 100644
--- a/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/image_caption.py
@@ -34,7 +34,7 @@ def compute_scores(self):
for key, value in total_scores.items():
print('{}:{}'.format(key, value))
return total_scores
-
+
class ImageCaptionDataset(ImageBaseDataset):
diff --git a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
index c45c412d..13e3549d 100644
--- a/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
+++ b/eval_mm/vlmevalkit/vlmeval/dataset/mvbench.py
@@ -104,7 +104,6 @@ def unzip_hf_zip(pth):
pth = os.path.join(pth, 'video/')
for filename in os.listdir(pth):
if filename.endswith('.zip'):
-            # Build the full file path
zip_path = os.path.join(pth, filename)
            # Extract the ZIP file
diff --git a/web_demos/web_demo_streamlit-2_5.py b/web_demos/web_demo_streamlit-2_5.py
index 4cee58c6..aa14ef97 100644
--- a/web_demos/web_demo_streamlit-2_5.py
+++ b/web_demos/web_demo_streamlit-2_5.py
@@ -6,10 +6,6 @@
# Model path
model_path = "openbmb/MiniCPM-Llama3-V-2_5"
-# User and assistant names
-U_NAME = "User"
-A_NAME = "Assistant"
-
# Set page configuration
st.set_page_config(
page_title="MiniCPM-Llama3-V-2_5 Streamlit",
@@ -17,93 +13,69 @@
layout="wide"
)
-
# Load model and tokenizer
@st.cache_resource
def load_model_and_tokenizer():
print(f"load_model_and_tokenizer from {model_path}")
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda")
+    model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda" if torch.cuda.is_available() else "cpu")  # Fall back to CPU when CUDA is unavailable
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
return model, tokenizer
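+# The cached loader above runs only once; later Streamlit reruns reuse the same model and tokenizer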
-
# Initialize session state
if 'model' not in st.session_state:
st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
st.session_state.model.eval()
- print("model and tokenizer had loaded completed!")
+ print("model and tokenizer loaded successfully!")
-# Initialize session state
if 'chat_history' not in st.session_state:
st.session_state.chat_history = []
# Sidebar settings
-sidebar_name = st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit")
+st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit")
max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01)
top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1)
temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
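+# The sampling sliders (top_p, top_k, repetition_penalty, temperature) are passed straight to model.chat() below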
-# Clear chat history button
-buttonClean = st.sidebar.button("Clear chat history", key="clean")
-if buttonClean:
+if st.sidebar.button("Clear chat history", key="clean"):
st.session_state.chat_history = []
st.session_state.response = ""
if torch.cuda.is_available():
torch.cuda.empty_cache()
- st.rerun()
+    st.experimental_rerun()  # Rerun the app so the cleared history is reflected immediately
# Display chat history
-for i, message in enumerate(st.session_state.chat_history):
+for message in st.session_state.chat_history:
if message["role"] == "user":
- with st.chat_message(name="user", avatar="user"):
- if message["image"] is not None:
- st.image(message["image"], caption='User uploaded image', width=448, use_column_width=False)
- continue
- elif message["content"] is not None:
- st.markdown(message["content"])
+        with st.chat_message(name="user", avatar="user"):
+            if message["image"] is not None:
+                st.image(message["image"], caption="User uploaded image", use_column_width=True)
+            if message["content"] is not None:
+                st.markdown(message["content"])
else:
- with st.chat_message(name="model", avatar="assistant"):
- st.markdown(message["content"])
+ st.chat_message(name="model", avatar="assistant").markdown(message["content"])
-# Select mode
+# Mode selection and image upload
selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
if selected_mode == "Image":
- # Image mode
- uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"],
- accept_multiple_files=False)
+ uploaded_image = st.sidebar.file_uploader("Upload image", type=["jpg", "jpeg", "png"], accept_multiple_files=False)
if uploaded_image is not None:
- st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
- # Add uploaded image to chat history
+ st.image(uploaded_image, caption="User uploaded image", use_column_width=True)
st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})
# User input box
user_text = st.chat_input("Enter your question")
if user_text:
- with st.chat_message(U_NAME, avatar="user"):
- st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
- st.markdown(f"{U_NAME}: {user_text}")
+ st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
+    st.chat_message("user", avatar="user").markdown(user_text)
- # Generate reply using the model
model = st.session_state.model
tokenizer = st.session_state.tokenizer
imagefile = None
- with st.chat_message(A_NAME, avatar="assistant"):
- # If the previous message contains an image, pass the image to the model
- if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
- uploaded_image = st.session_state.chat_history[-2]["image"]
- imagefile = Image.open(uploaded_image).convert('RGB')
-
- msgs = [{"role": "user", "content": user_text}]
- res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
- sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty,
- temperature=temperature, stream=True)
-
- # Collect the generated_text str
- generated_text = st.write_stream(res)
+ if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
+ imagefile = Image.open(st.session_state.chat_history[-2]["image"]).convert('RGB')
- st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None})
+ msgs = [{"role": "user", "content": user_text}]
+ res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer, sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty, temperature=temperature, stream=True)
+    generated_text = st.empty().write_stream(res)  # Stream the reply into a placeholder and collect the full text
+ st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None})
st.divider()
diff --git a/web_demos/web_demo_streamlit.py b/web_demos/web_demo_streamlit.py
index 1b893e6e..909f6b03 100644
--- a/web_demos/web_demo_streamlit.py
+++ b/web_demos/web_demo_streamlit.py
@@ -1,99 +1,118 @@
-import streamlit as st
+import gradio as gr
from PIL import Image
+import traceback
+import re
import torch
+import argparse
from transformers import AutoModel, AutoTokenizer
-# Model path
-model_path = "openbmb/MiniCPM-V-2"
+# README: how to run the demo on different devices
+# For Nvidia GPUs support BF16 (like A100, H100, RTX3090)
+# python web_demo_streamlit.py --device cuda --dtype bf16
-# User and assistant names
-U_NAME = "User"
-A_NAME = "Assistant"
+# For Nvidia GPUs do NOT support BF16 (like V100, T4, RTX2080)
+# python web_demo_streamlit.py --device cuda --dtype fp16
-# Set page configuration
-st.set_page_config(
- page_title="Minicpm-V-2 Streamlit",
- page_icon=":robot:",
- layout="wide"
-)
+# For Mac with MPS (Apple silicon or AMD GPUs).
+# PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_streamlit.py --device mps --dtype fp16
-# Load model and tokenizer
-@st.cache_resource
-def load_model_and_tokenizer():
- print(f"load_model_and_tokenizer from {model_path}")
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(
- device="cuda:0", dtype=torch.bfloat16)
- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
- return model, tokenizer
-
-# Initialize session state
-if 'model' not in st.session_state:
- st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
- print("model and tokenizer had loaded completed!")
+# Argparser
+parser = argparse.ArgumentParser(description='demo')
+parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
+parser.add_argument('--dtype', type=str, default='bf16', help='bf16 or fp16')
+args = parser.parse_args()
+device = args.device
+assert device in ['cuda', 'mps']
+if args.dtype == 'bf16':
+ if device == 'mps':
+ print('Warning: MPS does not support bf16, will use fp16 instead')
+ dtype = torch.float16
+ else:
+ dtype = torch.bfloat16
+else:
+ dtype = torch.float16
-# Initialize session state
-if 'chat_history' not in st.session_state:
- st.session_state.chat_history = []
+# Load model
+model_path = 'openbmb/MiniCPM-V-2'
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-# Sidebar settings
-sidebar_name = st.sidebar.title("Minicpm-V-2 Streamlit")
-max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
-top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
-temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
+model = model.to(device=device, dtype=dtype)
+model.eval()
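+# The .to() call above applies the device and dtype selected via the CLI flags; eval() puts the model in inference mode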
-# Clear chat history button
-buttonClean = st.sidebar.button("Clear chat history", key="clean")
-if buttonClean:
- st.session_state.chat_history = []
- st.session_state.response = ""
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- st.rerun()
+ERROR_MSG = "Error, please retry"
+model_name = 'MiniCPM-V 2.0'
-# Display chat history
-for i, message in enumerate(st.session_state.chat_history):
- if message["role"] == "user":
- with st.chat_message(name="user", avatar="user"):
- if message["image"] is not None:
- st.image(message["image"], caption='User uploaded image', width=468, use_column_width=False)
- continue
- elif message["content"] is not None:
- st.markdown(message["content"])
- else:
- with st.chat_message(name="model", avatar="assistant"):
- st.markdown(message["content"])
+form_radio = {'choices': ['Beam Search', 'Sampling'], 'value': 'Sampling', 'interactive': True, 'label': 'Decode Type'}
+# Beam Form
+num_beams_slider = {'minimum': 0, 'maximum': 5, 'value': 3, 'step': 1, 'interactive': True, 'label': 'Num Beams'}
+repetition_penalty_slider = {'minimum': 0, 'maximum': 3, 'value': 1.2, 'step': 0.01, 'interactive': True, 'label': 'Repetition Penalty'}
+repetition_penalty_slider2 = {'minimum': 0, 'maximum': 3, 'value': 1.05, 'step': 0.01, 'interactive': True, 'label': 'Repetition Penalty'}
-# Select mode
-selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
-if selected_mode == "Image":
- # Image mode
- uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], accept_multiple_files=False)
- if uploaded_image is not None:
- st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
- # Add uploaded image to chat history
- st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})
+# Sampling parameters (max new tokens, top-p, top-k, temperature)
+max_new_tokens_slider = {'minimum': 1, 'maximum': 4096, 'value': 1024, 'step': 1, 'interactive': True, 'label': 'Max New Tokens'}
+top_p_slider = {'minimum': 0, 'maximum': 1, 'value': 0.8, 'step': 0.05, 'interactive': True, 'label': 'Top P'}
+top_k_slider = {'minimum': 0, 'maximum': 200, 'value': 100, 'step': 1, 'interactive': True, 'label': 'Top K'}
+temperature_slider = {'minimum': 0, 'maximum': 2, 'value': 0.7, 'step': 0.05, 'interactive': True, 'label': 'Temperature'}
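+# These parameter dicts are meant to be passed to create_component() below to build the Gradio controls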
-# User input box
-user_text = st.chat_input("Enter your question")
-if user_text:
- with st.chat_message(U_NAME, avatar="user"):
- st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
- st.markdown(f"{U_NAME}: {user_text}")
+def create_component(params, comp='Slider'):
+    if comp == 'Slider':
+        return gr.Slider(minimum=params['minimum'], maximum=params['maximum'], value=params['value'], step=params['step'], interactive=params['interactive'], label=params['label'])
+    elif comp == 'Radio':
+        return gr.Radio(choices=params['choices'], value=params['value'], interactive=params['interactive'], label=params['label'])
+    elif comp == 'Button':
+        return gr.Button(value=params['value'], interactive=True)
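+# Example (assumed wiring, not shown in this diff): params_form = create_component(form_radio, comp='Radio'); num_beams = create_component(num_beams_slider)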
- # Generate reply using the model
- model = st.session_state.model
- tokenizer = st.session_state.tokenizer
+def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
+ default_params = {"num_beams":3, "repetition_penalty": 1.2, "max_new_tokens": 1024}
+ if params is None:
+ params = default_params
+ if img is None:
+ return -1, "Error, invalid image, please upload a new image", None, None
+ try:
+ image = img.convert('RGB')
+        answer, context, _ = model.chat(image=image, msgs=msgs, context=None, tokenizer=tokenizer, **params)
+        # Strip the model's grounding tags and bracketed box coordinates from the answer
+        res = re.sub(r'(<box>.*</box>)', '', answer)
+        res = res.replace('[', '')
+        res = res.replace(']', '')
+        res = res.replace('<ref>', '')
+        answer = res.replace('</ref>', '')
+ return 0, answer, None, None
+ except Exception as err:
+ print(err)
+ traceback.print_exc()
+ return -1, ERROR_MSG, None, None
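+# chat() returns (status_code, answer, context, vision_hidden_states); status 0 means success, -1 means failure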
- with st.chat_message(A_NAME, avatar="assistant"):
- # If the previous message contains an image, pass the image to the model
- if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
- uploaded_image = st.session_state.chat_history[-2]["image"]
- imagefile = Image.open(uploaded_image).convert('RGB')
+def upload_img(image, _chatbot, _app_session):
+ image = Image.fromarray(image)
+    _app_session['sts'] = None
+    _app_session['ctx'] = []
+    _app_session['img'] = image
+ _chatbot.append(('', 'Image uploaded successfully, you can talk to me now'))
+ return _chatbot, _app_session
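+# Uploading an image resets the per-session context, so the next question starts a fresh conversation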
- msgs = [{"role": "user", "content": user_text}]
- res, context, _ = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
- sampling=True,top_p=top_p,temperature=temperature)
- st.markdown(f"{A_NAME}: {res}")
- st.session_state.chat_history.append({"role": "model", "content": res, "image": None})
+def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
+ if _app_cfg.get('ctx', None) is None:
+ _chat_bot.append((_question, 'Please upload an image to start'))
+ return '', _chat_bot, _app_cfg
+ _context = _app_cfg['ctx'].copy()
+ if _context:
+ _context.append({"role": "user", "content": _question})
+ else:
+ _context = [{"role": "user", "content": _question}]
+    print('<User>:', _question)
+ if params_form == 'Beam Search':
+        params = {'sampling': False, 'num_beams': num_beams, 'repetition_penalty': repetition_penalty, 'max_new_tokens': 896}
+ else:
+        params = {'sampling': True, 'top_p': top_p, 'top_k': top_k, 'temperature': temperature, 'repetition_penalty': repetition_penalty_2, 'max_new_tokens': 896}
+ code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params)
+    print('<Assistant>:', _answer)
+ _context.append({"role": "assistant", "content": _answer})
+ _chat_bot.append((_question, _answer))
+ if code == 0:
+        _app_cfg['ctx'] = _context
+        _app_cfg['sts'] = sts
+ return '', _chat_bot, _app_cfg
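+# The updated context is persisted in _app_cfg only when the model call succeeds (code == 0)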
- st.divider()
+def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
+ return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature)
\ No newline at end of file