1. 物体检测模型:YOLOv8s
物体检测是计算机视觉领域的核心问题之一,它涉及识别图像中的物体并精确地确定它们的位置。YOLO(You Only Look Once)技术自2016年由Redmon等人首次提出以来,就以其高效的处理速度和优秀的性能引领了物体检测领域。YOLO的核心创新是将物体检测任务视为一个端到端的回归问题,通过单次前向传播,同时预测图像中多个边界框和相应的类别概率。YOLOv8s是YOLO系列较新版本YOLOv8中的小型(small)模型,继承并增强了YOLO体系的主要优点,如实时性能和高准确率。YOLOv8s进一步优化了模型架构和训练过程,使用深度卷积神经网络一次性分析整个图像,从而预测物体的类别和位置。这种端到端的训练方式不仅简化了训练过程,还增强了模型对不同尺寸物体的泛化能力。
2. 图像处理过程
图像处理流程从使用PIL(Python Imaging Library)库加载图像开始,这是一个广泛应用于Python中的图像处理库。图像加载后,即被输入到YOLOv8s模型进行推断处理。模型输出的结果包括每个检测到的物体的类别、置信度和边界框坐标。基于这些信息,在原始图像上绘制边界框和相应的标注信息,如物体的类别和置信度,这样的视觉呈现有助于我们直观地理解和评估模型的检测效果。
- CLIP模型技术介绍
CLIP(Contrastive Language-Image Pre-training)是由OpenAI开发的一种多模态学习框架,其设计目的是深入理解图像与文本之间的语义联系。此框架基于大规模对比学习的原则,通过并行训练图像编码器和文本编码器来识别图像与其描述之间的匹配关系,显著提升了模型对视觉任务的泛化能力。CLIP模型的核心优势在于其独特的训练方法,该方法采用海量的图像-文本对作为训练数据,通过优化图像和文本表示之间的相似度进行训练。这种策略使得CLIP不仅能理解广泛的视觉概念,还能将这些概念与自然语言有效结合,从而实现深层次的语义理解。
- 图像与文本的预处理与特征提取
- 相似度计算与分析
from ultralytics import YOLO
import os
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import torch
import matplotlib.pyplot as plt
import clip
# Bring up the pretrained YOLOv8s detector from its weights file
model = YOLO('yolov8s.pt')
# Read the picture to analyse through PIL
original_image = Image.open('img/img1.jpg')
# Run detection on the picture (explicit form of calling the model object)
results = model.predict(original_image)
# Annotations go onto a duplicate so original_image itself is not drawn on
draw_image = original_image.copy()
draw = ImageDraw.Draw(draw_image)
# Class mapping: integer class id (as found in boxes.cls) -> human-readable label.
# NOTE(review): presumably the label set of the pretrained weights — verify
# against the detector's own names table if the weights file is ever changed.
class_names = {
    0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane",
    5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light",
    10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench",
    14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep", 19: "cow",
    20: "elephant", 21: "bear", 22: "zebra", 23: "giraffe", 24: "backpack",
    25: "umbrella", 26: "handbag", 27: "tie", 28: "suitcase", 29: "frisbee",
    30: "skis", 31: "snowboard", 32: "sports ball", 33: "kite", 34: "baseball bat",
    35: "baseball glove", 36: "skateboard", 37: "surfboard", 38: "tennis racket",
    39: "bottle", 40: "wine glass", 41: "cup", 42: "fork", 43: "knife",
    44: "spoon", 45: "bowl", 46: "banana", 47: "apple", 48: "sandwich",
    49: "orange", 50: "broccoli", 51: "carrot", 52: "hot dog", 53: "pizza",
    54: "donut", 55: "cake", 56: "chair", 57: "couch", 58: "potted plant",
    59: "bed", 60: "dining table", 61: "toilet", 62: "tv", 63: "laptop",
    64: "mouse", 65: "remote", 66: "keyboard", 67: "cell phone", 68: "microwave",
    69: "oven", 70: "toaster", 71: "sink", 72: "refrigerator", 73: "book",
    74: "clock", 75: "vase", 76: "scissors", 77: "teddy bear", 78: "hair drier",
    79: "toothbrush",
}
# Annotate every detection of every result on the drawing copy
for result in results:
    boxes = result.boxes  # Boxes object for bbox outputs
    # Move the tensors to the CPU and convert them to NumPy arrays once per result
    cls_numpy = boxes.cls.cpu().numpy()
    conf_numpy = boxes.conf.cpu().numpy()
    xyxyn_numpy = boxes.xyxyn.cpu().numpy()
    # Walk the three arrays in lockstep: one entry of each per detection
    for class_id, score, box in zip(cls_numpy, conf_numpy, xyxyn_numpy):
        # Convert normalized coordinates to image (pixel) coordinates
        xmin = box[0] * original_image.width
        ymin = box[1] * original_image.height
        xmax = box[2] * original_image.width
        ymax = box[3] * original_image.height
        # Draw the bounding box
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=2)
        # Look up the label for the class id; ids missing from the mapping show as 'Unknown'
        class_label = class_names.get(int(class_id), 'Unknown')
        # Caption: class label plus confidence rounded to two decimals
        label = f'{class_label}: {score:.2f}'
        # Write the caption at the top-left corner of the box
        draw.text((xmin, ymin), label, fill="white")
# NOTE(review): the annotated image is not saved here; call
# draw_image.save(<path>) if a file on disk is needed.
# Helper that cuts a rectangular region out of an image
def crop_image(original_image, box):
    """Return the part of ``original_image`` enclosed by ``box``.

    Args:
        original_image: Object exposing ``crop((x1, y1, x2, y2))``; the
            script passes a PIL image.
        box: Iterable of exactly four numbers ``(x1, y1, x2, y2)``. Each
            value is converted with ``int()``, so fractional coordinates
            are truncated toward zero.

    Returns:
        Whatever ``original_image.crop`` returns for the integer box.
    """
    x1, y1, x2, y2 = map(int, box)
    return original_image.crop((x1, y1, x2, y2))
# Create the directory that receives the cropped images if it does not exist yet
os.makedirs('cropped_images', exist_ok=True)
# Process results list: annotate each detection and write its crop to disk
for result in results:
    boxes = result.boxes  # Boxes object for bbox outputs
    # Move the tensors to the CPU and convert them to NumPy arrays once per result
    cls_numpy = boxes.cls.cpu().numpy()
    conf_numpy = boxes.conf.cpu().numpy()
    xyxyn_numpy = boxes.xyxyn.cpu().numpy()
    # i is the detection's position within this result; it is part of the file name
    for i, (class_id, score, box) in enumerate(zip(cls_numpy, conf_numpy, xyxyn_numpy)):
        # Convert normalized coordinates to image (pixel) coordinates
        xmin = box[0] * original_image.width
        ymin = box[1] * original_image.height
        xmax = box[2] * original_image.width
        ymax = box[3] * original_image.height
        # Draw the bounding box (on draw_image only; original_image stays clean for cropping)
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=2)
        # Look up the label for the class id; ids missing from the mapping show as 'Unknown'
        class_label = class_names.get(int(class_id), 'Unknown')
        # Caption: class label plus confidence rounded to two decimals
        label = f'{class_label}: {score:.2f}'
        # Write the caption at the top-left corner of the box
        draw.text((xmin, ymin), label, fill="white")
        # Crop the image around the bounding box and save it
        cropped_img = crop_image(original_image, (xmin, ymin, xmax, ymax))
        cropped_img_path = os.path.join('cropped_images', f'{class_label}_{i}_{score:.2f}.jpg')
        # Without this call the path was computed but no file was ever written
        cropped_img.save(cropped_img_path)
# Load the CLIP model on the GPU when one is available, otherwise on the CPU.
# NOTE: this rebinds `model`; from here on it refers to CLIP, not the detector.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# Prepare your images and texts
your_image_folder = "cropped_images"  # Change to the folder where you stored cropped images
your_texts = ["Drink water"]  # Replace with your list of texts
images = []
# sorted() fixes the image order, so the order of the stacked batch is reproducible
for filename in sorted(os.listdir(your_image_folder)):
    if filename.endswith((".png", ".jpg")):
        path = os.path.join(your_image_folder, filename)
        # convert() returns a new image, so the file handle can be closed right away
        with Image.open(path) as opened:
            image = opened.convert("RGB")
        # Collect the preprocessed image; without this the batch would stay empty
        images.append(preprocess(image))
if not images:
    raise ValueError(f"No .png or .jpg images found in {your_image_folder!r}")
# Image and text preprocessing: stack the images into one batch and tokenize the texts
image_input = torch.tensor(np.stack(images)).to(device)
text_tokens = clip.tokenize(your_texts).to(device)
# Compute features; gradients are not needed for this forward pass
with torch.no_grad():
    image_features = model.encode_image(image_input).float()
    text_features = model.encode_text(text_tokens).float()
# Normalize the features to unit length along the last dimension, so the
# dot product below is a cosine similarity
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
print('image_features:', image_features)
print('text_features:', text_features)
print('image_features_shape:', image_features.shape)
print('text_features_shape:', text_features.shape)
# Calculate similarity on the CPU with NumPy.
# NOTE(review): presumably one row per text and one column per image — confirm
# against the printed shapes.
similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
# Print similarity scores
print('Similarity:', similarity)