Qwen2-VL 模型系列包括不同规模的版本，如 2B、7B 和 72B 参数规模的模型，以适应不同的应用需求和计算资源限制。
Qwen2-VL 可以处理不同分辨率和长宽比的图片，无需将图片分割成块，并且在各种视觉理解基准测试中表现出色，例如 MathVista（数学推理）、DocVQA（文档图像理解）、RealWorldQA（现实世界空间理解）以及 MTVQA（多语言理解）等。
Qwen2-VL 还能够理解长达 20 分钟以上的视频内容，这使得它能够在基于视频的问答、对话生成和内容创作等方面发挥作用。
pip install torch==2.4.0+cu118 torchvision==0.19.0+cu118 torchaudio==2.4.0 --extra-index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install av -i https://pypi.tuna.tsinghua.edu.cn/simple
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen2-VL-7B-Instruct.git
git lfs install
git clone https://www.modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct.git
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
class VisionLanguageModel:
    """Convenience wrapper around a Qwen2-VL checkpoint and its processor.

    The model is loaded in float16 with ``device_map="auto"``; the processor
    is loaded from the same directory with the given pixel bounds. The
    ``describe_*`` / ``identify_similarities`` helpers each send a single
    user turn and return the decoded reply.
    """

    def __init__(self, model_dir, min_pixels, max_pixels):
        """Load the model and the processor from ``model_dir``.

        Args:
            model_dir: Checkpoint name or path passed to ``from_pretrained``.
            min_pixels: Forwarded to ``AutoProcessor.from_pretrained``.
            max_pixels: Forwarded to ``AutoProcessor.from_pretrained``.
        """
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_dir, device_map="auto", torch_dtype=torch.float16
        )
        self.processor = AutoProcessor.from_pretrained(
            model_dir, min_pixels=min_pixels, max_pixels=max_pixels
        )

    def prepare_inputs(self, messages):
        """Turn chat ``messages`` into processor outputs on the model's device.

        Args:
            messages: Chat messages in the ``role`` / ``content`` structure
                accepted by ``apply_chat_template`` and ``process_vision_info``.

        Returns:
            The processor output, moved to ``self.model.device``.
        """
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the model's own placement rather than hard-coding "cuda":
        # the model is loaded with device_map="auto", so it may not be on a GPU.
        return inputs.to(self.model.device)

    def generate_output(self, inputs, max_new_tokens=128):
        """Generate a reply for ``inputs`` and decode only the new tokens.

        Args:
            inputs: Output of :meth:`prepare_inputs`; must expose ``input_ids``
                and be unpackable with ``**``.
            max_new_tokens: Generation budget passed to ``model.generate``.

        Returns:
            List of decoded strings, one per input sequence.
        """
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + continuation; strip the prompt prefix.
        trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    def _ask(self, content):
        """Send one user turn with the given ``content`` list and return the reply."""
        messages = [{"role": "user", "content": content}]
        return self.generate_output(self.prepare_inputs(messages))

    def describe_image(self, image_url):
        """Return the model's description of the image at ``image_url``."""
        return self._ask([
            {"type": "image", "image": image_url},
            {"type": "text", "text": "Describe this image."},
        ])

    def identify_similarities(self, image_paths):
        """Return the model's account of what the given images have in common."""
        content = [{"type": "image", "image": path} for path in image_paths]
        content.append({"type": "text", "text": "Identify the similarities between these images."})
        return self._ask(content)

    def describe_video(self, video_path):
        """Return the model's description of the video at ``video_path``."""
        return self._ask([
            {"type": "video", "video": video_path, 'max_pixels': 360*420, 'fps': 1.0},
            {"type": "text", "text": "Describe this video."},
        ])
# Usage example
# NOTE: these statements run at import time and load the checkpoint in model_dir.
model_dir = "Qwen2-VL-7B-Instruct"
# Pixel bounds handed to the VisionLanguageModel constructor.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
vl_model = VisionLanguageModel(model_dir, min_pixels, max_pixels)

# Describe an image
image_description = vl_model.describe_image("test.jpeg")
print(image_description)

# Identify similarities between images
image_paths = ["image1.jpg", "image2.jpg"]
image_similarities = vl_model.identify_similarities(image_paths)
print(image_similarities)

# Describe a video
video_description = vl_model.describe_video("video1.mp4")
print(video_description)
import copy
import re
import gc
from argparse import ArgumentParser
from threading import Thread
import gradio as gr
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer
# Checkpoint name or path used when -c/--checkpoint-path is not given on the command line.
DEFAULT_CKPT_PATH = 'Qwen2-VL-7B-Instruct'
def _get_args():
    """Parse command line arguments for the web demo.

    Returns:
        The ``argparse`` namespace with ``checkpoint_path``, ``cpu_only``,
        ``flash_attn2``, ``share``, ``inbrowser``, ``server_port`` and
        ``server_name``.
    """
    parser = ArgumentParser(description="Qwen2-VL WebUI Options.")
    parser.add_argument(
        '-c', '--checkpoint-path', type=str, default=DEFAULT_CKPT_PATH,
        help='Checkpoint name or path, default to %(default)r',
    )
    parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
    parser.add_argument(
        '--flash-attn2', action='store_true', default=False,
        help='Enable flash_attention_2 when loading the model.',
    )
    parser.add_argument(
        '--share', action='store_true', default=False,
        help='Create a publicly shareable link for the interface.',
    )
    parser.add_argument(
        '--inbrowser', action='store_true', default=False,
        help='Automatically launch the interface in a new tab on the default browser.',
    )
    parser.add_argument('--server-port', type=int, default=7870, help='Demo server port.')
    parser.add_argument('--server-name', type=str, default='', help='Demo server name.')
    return parser.parse_args()
def _load_model_processor(args):
    """Load the model and processor selected by the parsed arguments.

    Args:
        args: Namespace providing ``checkpoint_path``, ``cpu_only`` and
            ``flash_attn2``.

    Returns:
        A ``(model, processor)`` tuple, both loaded from ``args.checkpoint_path``.
    """
    model_load_args = {
        'torch_dtype': 'auto',
        'device_map': 'cpu' if args.cpu_only else 'auto',
    }
    if args.flash_attn2:
        model_load_args['attn_implementation'] = 'flash_attention_2'

    model = Qwen2VLForConditionalGeneration.from_pretrained(args.checkpoint_path, **model_load_args)
    processor = AutoProcessor.from_pretrained(args.checkpoint_path)
    return model, processor
def _parse_text(text):
"""Parse markdown-styled text to HTML."""
def html_escape(text):
"""Escape HTML special characters."""
html_escape_table = {
"`": r'\`', "<": "<", ">": ">", " ": " ", "*": "*", "_": "_",
"-": "-", ".": ".", "!": "!", "(": "(", ")": ")", "$": "$"
return "".join(html_escape_table.get(c, c) for c in text)
lines = filter(bool, text.split('\n'))
inside_code = False
for i, line in enumerate(lines):
if '```' in line:
inside_code = not inside_code
items = line.split('`')
lines[i] = f'<pre><code class="language-{items[-1]}">' if inside_code else '<br></code></pre>'
elif inside_code:
lines[i] = html_escape(line)
lines[i] = '<br>' + line
return ''.join(lines)
def _remove_image_special(text):
"""Remove image-related special tags from the text."""
text = text.replace('<ref>', '').replace('</ref>', '')
return re.sub(r'<box>.*?(</box>|$)', '', text)
def _is_video_file(filename):
"""Check if the given filename is a video file."""
video_extensions = {'.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'}
return any(filename.lower().endswith(ext) for ext in video_extensions)
def _gc():
"""Perform garbage collection and empty CUDA cache if available."""
if torch.cuda.is_available():
def _transform_messages(original_messages):
"""Transform original message structure to suitable format for processing."""
def get_content_type(content_item):
"""Identify the type of content item."""
if 'image' in content_item:
return {'type': 'image', 'image': content_item['image']}
elif 'text' in content_item:
return {'type': 'text', 'text': content_item['text']}
elif 'video' in content_item:
return {'type': 'video', 'video': content_item['video']}
return None
return [
{'role': message['role'], 'content': [get_content_type(item) for item in message['content'] if get_content_type(item)]}
for message in original_messages
def _launch_demo(args, model, processor):
    """Build the Gradio chat interface and launch it.

    Args:
        args: Parsed CLI namespace; ``share``, ``inbrowser``, ``server_port``
            and ``server_name`` are passed to ``launch``.
        model: The loaded model; its ``generate`` method and ``device``
            attribute are used.
        processor: The matching processor; it is called to build model inputs
            and its ``tokenizer`` is used for streaming.
    """

    def call_local_model(model, processor, messages):
        """Run generation for ``messages`` and yield the reply as it grows.

        Yields:
            The accumulated generated text after each streamed chunk.
        """
        messages = _transform_messages(messages)
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt')
        inputs = inputs.to(model.device)

        streamer = TextIteratorStreamer(processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = {'max_new_tokens': 512, 'streamer': streamer, **inputs}
        # generate() runs in a worker thread so this generator can consume the
        # streamer concurrently; without start() the streamer never receives text.
        thread = Thread(target=model.generate, kwargs=gen_kwargs)
        thread.start()

        generated_text = ''
        for new_text in streamer:
            generated_text += new_text
            yield generated_text

    def build_messages(task_history):
        """Convert the ``(query, answer)`` history into chat messages.

        File entries (tuple/list queries) are accumulated and attached to the
        next text query, so each user turn carries its files plus its text.
        """
        messages, content = [], []
        for q, a in copy.deepcopy(task_history):
            if isinstance(q, (tuple, list)):
                file_type = 'video' if _is_video_file(q[0]) else 'image'
                content.append({file_type: f'file://{q[0]}'})
            else:
                content.append({'text': q})
                messages.append({'role': 'user', 'content': content})
                messages.append({'role': 'assistant', 'content': [{'text': a}]})
                content = []
        # The pending turn has no answer yet; drop its assistant placeholder so
        # the conversation ends on the user message to be answered.
        if messages and messages[-1]['content'][0]['text'] is None:
            messages.pop()
        return messages

    def create_predict_fn():
        def predict(_chatbot, task_history):
            """Generate a response for the last query, streaming chatbot updates."""
            chat_query = _chatbot[-1][0]
            query = task_history[-1][0]
            if not chat_query:
                # This is a generator: a value must be yielded to reach Gradio.
                yield _chatbot
                return
            print('User: ' + _parse_text(query))

            messages = build_messages(task_history)
            response = ''
            if messages:
                for response in call_local_model(model, processor, messages):
                    _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
                    yield _chatbot

            full_response = _parse_text(response)
            task_history[-1] = (query, full_response)
            print('Qwen-VL-Chat: ' + full_response)
            yield _chatbot

        return predict

    def create_regenerate_fn():
        def regenerate(_chatbot, task_history):
            """Discard the last response and generate it again."""
            if not task_history:
                yield _chatbot
                return
            query, last_response = task_history[-1]
            if last_response is None:
                yield _chatbot
                return
            task_history[-1] = (query, None)
            last_q, _ = _chatbot[-1]
            _chatbot[-1] = (last_q, None)
            yield from predict(_chatbot, task_history)

        return regenerate

    # Bound here so regenerate() can reach predict through this enclosing scope.
    predict = create_predict_fn()
    regenerate = create_regenerate_fn()

    def add_text(history, task_history, text):
        """Add text input to histories and reset the input box."""
        history = history or []
        task_history = task_history or []
        history.append((_parse_text(text), None))
        task_history.append((text, None))
        return history, task_history, ''

    def add_file(history, task_history, file):
        """Add file input to histories."""
        history = history or []
        task_history = task_history or []
        history.append(((file.name,), None))
        task_history.append(((file.name,), None))
        return history, task_history

    def reset_user_input():
        """Reset the user input box."""
        return gr.update(value='')

    def reset_state(_chatbot, task_history):
        """Clear history and perform garbage collection."""
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    with gr.Blocks() as demo:
        gr.Markdown(
            '<p align="center"><img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png" style="height: 80px"/></p>'
        )
        gr.Markdown('<center><font size=8>Qwen2-VL</center>')
        gr.Markdown('<center><font size=3>This WebUI is based on Qwen2-VL, developed by Alibaba Cloud.</center>')
        gr.Markdown('<center><font size=3>本WebUI基于Qwen2-VL。</center>')

        chatbot = gr.Chatbot(label='Qwen2-VL', elem_classes='control-height', height=500)
        query = gr.Textbox(lines=2, label='Input')
        task_history = gr.State([])

        with gr.Row():
            addfile_btn = gr.UploadButton('📁 Upload (上传文件)', file_types=['image', 'video'])
            submit_btn = gr.Button('🚀 Submit (发送)')
            regen_btn = gr.Button('🤔️ Regenerate (重试)')
            empty_btn = gr.Button('🧹 Clear History (清除历史)')

        # add_text returns three values; the third one clears the input box.
        submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history, query]).then(
            predict, [chatbot, task_history], [chatbot], show_progress=True
        )
        submit_btn.click(reset_user_input, [], [query])
        empty_btn.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

        gr.Markdown(
            '<font size=2>Note: This demo is governed by the original license of Qwen2-VL. '
            'We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, '
            'including hate speech, violence, pornography, deception, etc.'
        )

    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        # The CLI default is ''; pass None in that case so Gradio uses its own default host.
        server_name=args.server_name or None,
    )
def main():
    """Parse arguments, load the model and processor, and launch the demo."""
    args = _get_args()
    model, processor = _load_model_processor(args)
    _launch_demo(args, model, processor)
if __name__ == '__main__':