简介:
调用本地摄像头，通过多模态大语言模型实时感知世界，并进行交互。
界面:
代码:
import tkinter as tk
from tkinter import ttk
from PIL import Image, ImageTk
import cv2
import requests
# 定义处理函数
def _show_result(text):
    """Replace the contents of the model-output box with *text*."""
    result_entry.delete(1.0, tk.END)
    result_entry.insert(tk.END, text)


def capture_and_send(*, url="http://10.136.22.140:7860/qwenvl2", timeout=60):
    """Send the current camera frame and the typed prompt to the model service.

    The frame is JPEG-encoded and posted as multipart form data together
    with the prompt. The ``text`` field of the JSON reply, or an error
    message, is written to the result box. The request is synchronous, so
    the Tk event loop is blocked until the reply arrives or the timeout
    expires.

    Args:
        url: Endpoint of the multi-modal model service.
        timeout: Seconds to wait for the service before giving up, so an
            unreachable server cannot hang the UI indefinitely.
    """
    # Grab the current frame from the camera.
    ok, frame = cap.read()
    if not ok:
        _show_result("Error: could not read a frame from the camera")
        return

    # Encode the frame as JPEG bytes for upload.
    encoded, buffer = cv2.imencode('.jpg', frame)
    if not encoded:
        _show_result("Error: could not encode the frame as JPEG")
        return

    # Multipart payload: the image file plus the user's prompt.
    files = {'image': ('image.jpg', buffer.tobytes(), 'image/jpeg')}
    data = {'prompt': prompt_entry.get()}

    try:
        response = requests.post(url, files=files, data=data, timeout=timeout)
    except requests.RequestException as exc:
        # Connection refused, DNS failure, timeout, etc.
        _show_result("Error: " + str(exc))
        return

    if response.status_code != 200:
        _show_result("Error: " + str(response.status_code))
        return

    try:
        result = response.json()['text']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected 'text' field.
        _show_result("Error: unexpected response from server")
        return

    _show_result(result)
def clear_text():
    """Empty both the prompt entry and the model-output text box."""
    # The Entry is cleared from index 0, the Text widget from index 1.0.
    for widget, first_index in ((prompt_entry, 0), (result_entry, 1.0)):
        widget.delete(first_index, tk.END)
def update_frame():
    """Show the latest camera frame in ``camera_label`` and reschedule itself."""
    ok, bgr_frame = cap.read()
    if ok:
        # Convert from OpenCV's BGR channel order to RGB and wrap the
        # result in a Tk-compatible image object.
        photo = ImageTk.PhotoImage(
            image=Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
        )
        # Store the image on the widget so a reference to it stays alive.
        camera_label.imgtk = photo
        camera_label.configure(image=photo)
    # Run again in 10 milliseconds.
    camera_label.after(10, update_frame)
# Create the main window.
root = tk.Tk()
root.title("Multi-modal AI Interface")
root.geometry("900x600")  # set the window size

# Style configuration: Arial 12 everywhere, with extra padding on buttons.
# NOTE(review): "TText" does not match any ttk widget created in this file
# (the output box is a plain tk.Text) — confirm whether it has any effect.
style = ttk.Style()
style.configure("TButton", font=("Arial", 12), padding=10)
for _style_name in ("TLabel", "TEntry", "TText"):
    style.configure(_style_name, font=("Arial", 12))
def _grid_right(widget, row, sticky):
    """Place *widget* in column 1 at *row* with the shared padding."""
    widget.grid(row=row, column=1, padx=10, pady=5, sticky=sticky)


# Camera preview area: column 0, spanning all six rows.
camera_label = ttk.Label(root)
camera_label.grid(row=0, column=0, rowspan=6, padx=10, pady=10, sticky="nsew")

# Prompt caption and input field.
prompt_label = ttk.Label(root, text="Enter prompt:")
_grid_right(prompt_label, 0, "w")
prompt_entry = ttk.Entry(root, width=50)
_grid_right(prompt_entry, 1, "ew")

# Model output caption and text box.
result_label = ttk.Label(root, text="Model Output:")
_grid_right(result_label, 2, "w")
result_entry = tk.Text(root, width=50, height=15)
_grid_right(result_entry, 3, "ew")

# Send button: captures a frame and posts it with the prompt.
send_button = ttk.Button(root, text="Send to Model", command=capture_and_send)
_grid_right(send_button, 4, "ew")

# Clear button: empties both text fields.
clear_button = ttk.Button(root, text="Clear Text", command=clear_text)
_grid_right(clear_button, 5, "ew")
# Open the camera at device index 0.
cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        # Surface the failure instead of silently showing an empty preview.
        result_entry.insert(tk.END, "Error: could not open camera 0")
    # Start the live preview; update_frame reschedules itself via after().
    update_frame()
    # Run the Tk main loop.
    root.mainloop()
finally:
    # Always release the camera, even if the main loop raised.
    cap.release()
    cv2.destroyAllWindows()