Code: https://github.com/OFA-Sys/Chinese-CLIP/blob/master/deployment.md
Table of contents
- Environment setup and ONNX inference
- Converting all models to ONNX
- Script for testing all ONNX models
- Running ONNX on CPU
- Docker image
Environment setup and ONNX inference
Set up the environment, download the pretrained weights into the designated directory, and run the ONNX conversion:
conda create -n clip_cn python=3.8 -y
conda activate clip_cn
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.6 -c pytorch -c conda-forge -y
conda install numpy tqdm six -y
conda install -c conda-forge timm -y
conda install -c anaconda lmdb==1.3.0 -y
cd Chinese-CLIP
pip install -e .
mkdir DATAPATH
export CUDA_VISIBLE_DEVICES=0
export PYTHONPATH=${PYTHONPATH}:`pwd`/cn_clip
export DATAPATH="./DATAPATH"
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_vit-b-16.pt # full path of the checkpoint to convert
echo $checkpoint_path
mkdir -p ${DATAPATH}/deploy/ # create the output folder for the ONNX models
pip install onnx onnxruntime-gpu onnxmltools
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch ViT-B-16 \
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/vit-b-16 \
--convert-text --convert-vision
After a successful conversion you should see:
Finished PyTorch to ONNX conversion...
>>> The text FP32 ONNX model is saved at ./DATAPATH/deploy/vit-b-16.txt.fp32.onnx
>>> The text FP16 ONNX model is saved at ./DATAPATH/deploy/vit-b-16.txt.fp16.onnx with extra file ./DATAPATH/deploy/vit-b-16.txt.fp16.onnx.extra_file
>>> The vision FP32 ONNX model is saved at ./DATAPATH/deploy/vit-b-16.img.fp32.onnx
>>> The vision FP16 ONNX model is saved at ./DATAPATH/deploy/vit-b-16.img.fp16.onnx with extra file ./DATAPATH/deploy/vit-b-16.img.fp16.onnx.extra_file
The converted files can now be found under ${DATAPATH}/deploy/.
Python code to test whether the conversion succeeded:
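Before running the full test script, a quick sanity check is to open each exported file with onnxruntime and print its input/output names. This is a minimal sketch using the vit-b-16 file names produced above; the names image/unnorm_image_features and text/unnorm_text_features are the ones used by the test code below:
import onnxruntime

# Open each exported ONNX file and list its input/output names.
for path in ["./DATAPATH/deploy/vit-b-16.img.fp16.onnx",
             "./DATAPATH/deploy/vit-b-16.txt.fp16.onnx"]:
    sess = onnxruntime.InferenceSession(path, providers=["CPUExecutionProvider"])
    print(path)
    print("  inputs :", [i.name for i in sess.get_inputs()])
    print("  outputs:", [o.name for o in sess.get_outputs()])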
# Necessary imports (omitted in later snippets)
import onnxruntime
from PIL import Image
import numpy as np
import torch
import argparse
import cn_clip.clip as clip
from clip import load_from_name, available_models
from clip.utils import _MODELS, _MODEL_INFO, _download, available_models, create_model, image_transform
# Load the ONNX image-side model (**replace ${DATAPATH} with the actual path**)
img_sess_options = onnxruntime.SessionOptions()
img_run_options = onnxruntime.RunOptions()
img_run_options.log_severity_level = 2
img_onnx_model_path="./DATAPATH/deploy/vit-b-16.img.fp16.onnx"
img_session = onnxruntime.InferenceSession(img_onnx_model_path,
                                           sess_options=img_sess_options,
                                           providers=["CUDAExecutionProvider"])
# Preprocess the image
model_arch = "ViT-B-16" # ViT-B-16 is used here; adjust for other model sizes
preprocess = image_transform(_MODEL_INFO[model_arch]['input_resolution'])
# Example Pikachu image; preprocessing yields a Torch Tensor of shape [1, 3, resolution, resolution]
image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0)
# Compute image-side features with the ONNX model
image_features = img_session.run(["unnorm_image_features"], {"image": image.cpu().numpy()})[0] # unnormalized image features
image_features = torch.tensor(image_features)
image_features /= image_features.norm(dim=-1, keepdim=True) # normalized Chinese-CLIP image features, for downstream tasks
print(image_features.shape) # Torch Tensor shape: [1, feature dim]
# Load the ONNX text-side model (**replace ${DATAPATH} with the actual path**)
txt_sess_options = onnxruntime.SessionOptions()
txt_run_options = onnxruntime.RunOptions()
txt_run_options.log_severity_level = 2
txt_onnx_model_path="./DATAPATH/deploy/vit-b-16.txt.fp16.onnx"
txt_session = onnxruntime.InferenceSession(txt_onnx_model_path,
                                           sess_options=txt_sess_options,
                                           providers=["CUDAExecutionProvider"])
# Tokenize the 4 input texts. The sequence length is set to 52 and must match the one used when converting the ONNX model (see the context-length argument at conversion time)
text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], context_length=52)
# Compute text-side features one by one with the ONNX model
text_features = []
for i in range(len(text)):
    one_text = np.expand_dims(text[i].cpu().numpy(), axis=0)
    text_feature = txt_session.run(["unnorm_text_features"], {"text": one_text})[0] # unnormalized text features
    text_feature = torch.tensor(text_feature)
    text_features.append(text_feature)
text_features = torch.squeeze(torch.stack(text_features), dim=1) # stack the 4 feature vectors together
text_features = text_features / text_features.norm(dim=1, keepdim=True) # normalized Chinese-CLIP text features, for downstream tasks
print(text_features.shape) # Torch Tensor shape: [4, feature dim]
# Inner product followed by softmax
# Note: because contrastive training uses a temperature, the inner product must be
# multiplied by the model's logit_scale.exp(); all our pretrained models have logit_scale = 4.6052, hence the factor of 100 here
# For your own checkpoints, load them with torch.load and check ckpt['state_dict']['module.logit_scale'] or ckpt['state_dict']['logit_scale']
logits_per_image = 100 * image_features @ text_features.t()
print(logits_per_image.softmax(dim=-1)) # image-text similarity probabilities: [[1.2252e-03, 5.2874e-02, 6.7116e-04, 9.4523e-01]]
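To double-check that the ONNX path stays faithful to the original checkpoint, you can also compute the same features with the PyTorch model via load_from_name and compare cosine similarities. This is a minimal sketch that reuses the image, text, image_features and text_features variables from above; FP16 ONNX versus the PyTorch model will not match bit-for-bit, but the cosine similarity should be very close to 1. It assumes the checkpoint under ./DATAPATH/pretrained_weights/ is named as load_from_name expects; otherwise a copy is downloaded there:
from cn_clip.clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = load_from_name("ViT-B-16", device=device, download_root="./DATAPATH/pretrained_weights/")
model.eval()
with torch.no_grad():
    pt_image_features = model.encode_image(image.to(device)).float().cpu()
    pt_text_features = model.encode_text(text.to(device)).float().cpu()
pt_image_features = pt_image_features / pt_image_features.norm(dim=-1, keepdim=True)
pt_text_features = pt_text_features / pt_text_features.norm(dim=-1, keepdim=True)
print("image cosine vs ONNX:", (pt_image_features * image_features.float()).sum(dim=-1))
print("text cosine vs ONNX:", (pt_text_features * text_features.float()).sum(dim=-1))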
Converting all models to ONNX
# ["RN50", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14"]
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_rn50.pt
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch RN50 \
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/RN50 \
--convert-text --convert-vision
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_vit-b-16.pt
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch ViT-B-16 \
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/ViT-B-16 \
--convert-text --convert-vision
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_vit-l-14.pt
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch "ViT-L-14"\
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/ViT-L-14 \
--convert-text --convert-vision
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_vit-l-14-336.pt
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch "ViT-L-14-336"\
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/ViT-L-14-336 \
--convert-text --convert-vision
checkpoint_path=${DATAPATH}/pretrained_weights/clip_cn_vit-h-14.pt
python cn_clip/deploy/pytorch_to_onnx.py \
--model-arch "ViT-H-14"\
--pytorch-ckpt-path ${checkpoint_path} \
--save-onnx-path ${DATAPATH}/deploy/ViT-H-14 \
--convert-text --convert-vision
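After the batch conversion finishes, a quick existence check can save some debugging later. This is just a sketch; it only verifies that the files were written, not that they are valid:
import os

# Check that the FP16/FP32 ONNX files exist for every converted architecture.
deploy_dir = "./DATAPATH/deploy"
for name in ["RN50", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14"]:
    for side in ["img", "txt"]:
        for precision in ["fp16", "fp32"]:
            path = os.path.join(deploy_dir, f"{name}.{side}.{precision}.onnx")
            print(path, "OK" if os.path.exists(path) else "MISSING")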
Script for testing all ONNX models
# Necessary imports (omitted in later snippets)
import onnxruntime
from PIL import Image
import numpy as np
import torch
import argparse
import cn_clip.clip as clip
from clip import load_from_name, available_models
from clip.utils import _MODELS, _MODEL_INFO, _download, available_models, create_model, image_transform
model_list=["RN50", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14"]
for model_name in model_list:
    print(model_name)
    # Load the ONNX image-side model (**replace ${DATAPATH} with the actual path**)
    img_sess_options = onnxruntime.SessionOptions()
    img_run_options = onnxruntime.RunOptions()
    img_run_options.log_severity_level = 2
    img_onnx_model_path=f"./DATAPATH/deploy/{model_name}.img.fp16.onnx"
    img_session = onnxruntime.InferenceSession(img_onnx_model_path,
                                               sess_options=img_sess_options,
                                               providers=["CUDAExecutionProvider"])
    # Preprocess the image
    model_arch = model_name # use the architecture currently selected by the loop
    preprocess = image_transform(_MODEL_INFO[model_arch]['input_resolution'])
    # Example Pikachu image; preprocessing yields a Torch Tensor of shape [1, 3, resolution, resolution]
    image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0)
    # Compute image-side features with the ONNX model
    image_features = img_session.run(["unnorm_image_features"], {"image": image.cpu().numpy()})[0] # unnormalized image features
    image_features = torch.tensor(image_features)
    image_features /= image_features.norm(dim=-1, keepdim=True) # normalized Chinese-CLIP image features, for downstream tasks
    print(image_features.shape) # Torch Tensor shape: [1, feature dim]
    # Load the ONNX text-side model (**replace ${DATAPATH} with the actual path**)
    txt_sess_options = onnxruntime.SessionOptions()
    txt_run_options = onnxruntime.RunOptions()
    txt_run_options.log_severity_level = 2
    txt_onnx_model_path=f"./DATAPATH/deploy/{model_name}.txt.fp16.onnx"
    txt_session = onnxruntime.InferenceSession(txt_onnx_model_path,
                                               sess_options=txt_sess_options,
                                               providers=["CUDAExecutionProvider"])
    # Tokenize the 4 input texts. The sequence length is set to 52 and must match the one used when converting the ONNX model (see the context-length argument at conversion time)
    text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], context_length=52)
    # Compute text-side features one by one with the ONNX model
    text_features = []
    for i in range(len(text)):
        one_text = np.expand_dims(text[i].cpu().numpy(), axis=0)
        text_feature = txt_session.run(["unnorm_text_features"], {"text": one_text})[0] # unnormalized text features
        text_feature = torch.tensor(text_feature)
        text_features.append(text_feature)
    text_features = torch.squeeze(torch.stack(text_features), dim=1) # stack the 4 feature vectors together
    text_features = text_features / text_features.norm(dim=1, keepdim=True) # normalized Chinese-CLIP text features, for downstream tasks
    print(text_features.shape) # Torch Tensor shape: [4, feature dim]
    # Inner product followed by softmax
    # Note: because contrastive training uses a temperature, the inner product must be
    # multiplied by the model's logit_scale.exp(); all our pretrained models have logit_scale = 4.6052, hence the factor of 100 here
    # For your own checkpoints, load them with torch.load and check ckpt['state_dict']['module.logit_scale'] or ckpt['state_dict']['logit_scale']
    logits_per_image = 100 * image_features @ text_features.t()
    print(logits_per_image.softmax(dim=-1)) # image-text similarity probabilities, e.g. [[1.2252e-03, 5.2874e-02, 6.7116e-04, 9.4523e-01]] for ViT-B-16
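If you also want a rough feel for per-model latency, the session objects created above can be timed directly. A minimal sketch follows; it reuses img_session, txt_session, image and one_text left over from the last loop iteration, and the absolute numbers depend entirely on your GPU:
import time

def time_session(session, output_names, feed, n_warmup=3, n_runs=20):
    # Warm-up runs first (CUDA initialization, kernel caching), then average wall-clock time.
    for _ in range(n_warmup):
        session.run(output_names, feed)
    start = time.perf_counter()
    for _ in range(n_runs):
        session.run(output_names, feed)
    return (time.perf_counter() - start) / n_runs * 1000  # milliseconds per run

print("image side, ms/run:", time_session(img_session, ["unnorm_image_features"], {"image": image.cpu().numpy()}))
print("text side, ms/run:", time_session(txt_session, ["unnorm_text_features"], {"text": one_text}))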
Running ONNX on CPU
Environment:
conda create -n clip_cn_cpu python=3.8 -y
conda activate clip_cn_cpu
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cpuonly -c pytorch -y
conda install numpy tqdm six -y
conda install -c conda-forge timm -y
conda install -c anaconda lmdb==1.3.0 -y
cd Chinese-CLIP
pip install -e .
export PYTHONPATH=${PYTHONPATH}:`pwd`/cn_clip
pip install onnx onnxruntime onnxmltools
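Before running the test script, it can be worth confirming which execution providers this onnxruntime build actually exposes. One line does it; the CPU-only package lists CPUExecutionProvider, while CUDAExecutionProvider only appears with the onnxruntime-gpu package:
import onnxruntime
print(onnxruntime.get_available_providers())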
Test script:
# Necessary imports (omitted in later snippets)
import onnxruntime
from PIL import Image
import numpy as np
import torch
import argparse
import cn_clip.clip as clip
from clip import load_from_name, available_models
from clip.utils import _MODELS, _MODEL_INFO, _download, available_models, create_model, image_transform
model_list=["RN50", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14"]
for model_name in model_list:
    print(model_name)
    # Load the ONNX image-side model (**replace ${DATAPATH} with the actual path**)
    img_sess_options = onnxruntime.SessionOptions()
    img_run_options = onnxruntime.RunOptions()
    img_run_options.log_severity_level = 2
    img_onnx_model_path=f"./DATAPATH/deploy/{model_name}.img.fp16.onnx"
    img_session = onnxruntime.InferenceSession(img_onnx_model_path,
                                               sess_options=img_sess_options,
                                               providers=["CPUExecutionProvider"])
    # Preprocess the image
    model_arch = model_name # use the architecture currently selected by the loop
    preprocess = image_transform(_MODEL_INFO[model_arch]['input_resolution'])
    # Example Pikachu image; preprocessing yields a Torch Tensor of shape [1, 3, resolution, resolution]
    image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0)
    # Compute image-side features with the ONNX model
    image_features = img_session.run(["unnorm_image_features"], {"image": image.cpu().numpy()})[0] # unnormalized image features
    image_features = torch.tensor(image_features)
    image_features /= image_features.norm(dim=-1, keepdim=True) # normalized Chinese-CLIP image features, for downstream tasks
    print(image_features.shape) # Torch Tensor shape: [1, feature dim]
    # Load the ONNX text-side model (**replace ${DATAPATH} with the actual path**)
    txt_sess_options = onnxruntime.SessionOptions()
    txt_run_options = onnxruntime.RunOptions()
    txt_run_options.log_severity_level = 2
    txt_onnx_model_path=f"./DATAPATH/deploy/{model_name}.txt.fp16.onnx"
    txt_session = onnxruntime.InferenceSession(txt_onnx_model_path,
                                               sess_options=txt_sess_options,
                                               providers=["CPUExecutionProvider"])
    # Tokenize the 4 input texts. The sequence length is set to 52 and must match the one used when converting the ONNX model (see the context-length argument at conversion time)
    text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], context_length=52)
    # Compute text-side features one by one with the ONNX model
    text_features = []
    for i in range(len(text)):
        one_text = np.expand_dims(text[i].cpu().numpy(), axis=0)
        text_feature = txt_session.run(["unnorm_text_features"], {"text": one_text})[0] # unnormalized text features
        text_feature = torch.tensor(text_feature)
        text_features.append(text_feature)
    text_features = torch.squeeze(torch.stack(text_features), dim=1) # stack the 4 feature vectors together
    text_features = text_features / text_features.norm(dim=1, keepdim=True) # normalized Chinese-CLIP text features, for downstream tasks
    print(text_features.shape) # Torch Tensor shape: [4, feature dim]
    # Inner product followed by softmax
    # Note: because contrastive training uses a temperature, the inner product must be
    # multiplied by the model's logit_scale.exp(); all our pretrained models have logit_scale = 4.6052, hence the factor of 100 here
    # For your own checkpoints, load them with torch.load and check ckpt['state_dict']['module.logit_scale'] or ckpt['state_dict']['logit_scale']
    logits_per_image = 100 * image_features @ text_features.t()
    print(logits_per_image.softmax(dim=-1)) # image-text similarity probabilities, e.g. [[1.2252e-03, 5.2874e-02, 6.7116e-04, 9.4523e-01]] for ViT-B-16
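On CPU, throughput depends heavily on threading. If needed, the thread counts can be set on SessionOptions before the session is created; a minimal sketch, where 4 intra-op threads is an arbitrary choice to tune for your machine:
import onnxruntime

sess_options = onnxruntime.SessionOptions()
sess_options.intra_op_num_threads = 4   # threads used inside a single operator
sess_options.inter_op_num_threads = 1   # threads used across independent operators
session = onnxruntime.InferenceSession("./DATAPATH/deploy/ViT-B-16.img.fp16.onnx",
                                       sess_options=sess_options,
                                       providers=["CPUExecutionProvider"])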
Docker image
Dockerfile:
FROM ubuntu:20.04
RUN apt-get update && apt-get install -y wget curl
RUN wget http://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/conda.sh && \
bash /tmp/conda.sh -b && rm /tmp/conda.sh
WORKDIR /Chinese-CLIP/
COPY ./ /Chinese-CLIP/
RUN cp ./deps/.condarc /root/
ENV DEBIAN_FRONTEND=noninteractive
# Install the tzdata package and set the timezone to Shanghai (non-interactive)
RUN apt-get update && \
apt-get install -y tzdata && \
ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
echo "Asia/Shanghai" > /etc/timezone
RUN ~/miniconda3/bin/conda init bash && . ~/.bashrc
RUN apt-get install -y python3-pip
RUN . ~/.bashrc && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN . ~/.bashrc && conda create -n clip_cn_cpu python=3.8 -y && \
conda activate clip_cn_cpu && \
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cpuonly -c pytorch -y
RUN . ~/.bashrc && conda activate clip_cn_cpu && conda install numpy tqdm six -y
RUN . ~/.bashrc && conda activate clip_cn_cpu && conda install -c conda-forge timm -y
RUN . ~/.bashrc && conda activate clip_cn_cpu && pip install lmdb==1.3.0
RUN . ~/.bashrc && conda activate clip_cn_cpu && pip install onnx onnxruntime onnxmltools fastapi uvicorn python-multipart
RUN . ~/.bashrc && conda activate clip_cn_cpu && cd /Chinese-CLIP && pip install -e .
ENV PYTHONPATH=${PYTHONPATH}:/Chinese-CLIP/cn_clip
COPY ./main_clip_cn.py /Chinese-CLIP/main_clip_cn.py
COPY ./deps/startProject.sh /Chinese-CLIP/deps/startProject.sh
EXPOSE 7863
ENTRYPOINT ["bash", "/Chinese-CLIP/deps/startProject.sh"]
The PID 1 process (deps/startProject.sh):
#!/bin/bash
set -u
cd /Chinese-CLIP/ && nohup /root/miniconda3/envs/clip_cn_cpu/bin/python main_clip_cn.py >/log.alg 2>&1 &
echo "----------------------------------------"
echo "running..."
while true; do
    echo "sleep"
    sleep 3600
done
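main_clip_cn.py itself is not shown in this post. The sketch below is a hypothetical reconstruction, inferred from the Dockerfile (fastapi, uvicorn, python-multipart, port 7863) and from the client code further down; the endpoint paths, the fileb/text/model_name field names and the {"data": ...} response shape are assumptions based on that client, not the original file:
# Hypothetical sketch of main_clip_cn.py, inferred from the client code below.
import io

import onnxruntime
import torch
from fastapi import FastAPI, File, Form, UploadFile
from PIL import Image

import cn_clip.clip as clip
from clip.utils import _MODEL_INFO, image_transform

app = FastAPI()
SESSIONS = {}  # lazily created (img_session, txt_session, preprocess) per model

def get_model(model_name: str):
    if model_name not in SESSIONS:
        img = onnxruntime.InferenceSession(
            f"./DATAPATH/deploy/{model_name}.img.fp16.onnx", providers=["CPUExecutionProvider"])
        txt = onnxruntime.InferenceSession(
            f"./DATAPATH/deploy/{model_name}.txt.fp16.onnx", providers=["CPUExecutionProvider"])
        preprocess = image_transform(_MODEL_INFO[model_name]["input_resolution"])
        SESSIONS[model_name] = (img, txt, preprocess)
    return SESSIONS[model_name]

@app.post("/cal_image_feature")
async def cal_image_feature(fileb: UploadFile = File(...), model_name: str = Form("ViT-B-16")):
    img_session, _, preprocess = get_model(model_name)
    image = Image.open(io.BytesIO(await fileb.read())).convert("RGB")
    x = preprocess(image).unsqueeze(0).numpy()
    feat = torch.tensor(img_session.run(["unnorm_image_features"], {"image": x})[0])
    feat = feat / feat.norm(dim=-1, keepdim=True)  # return normalized features
    return {"data": feat.tolist()}

@app.post("/cal_text_feature")
async def cal_text_feature(text: str = Form(...), model_name: str = Form("ViT-B-16")):
    _, txt_session, _ = get_model(model_name)
    tokens = clip.tokenize([text], context_length=52).numpy()
    feat = torch.tensor(txt_session.run(["unnorm_text_features"], {"text": tokens})[0])
    feat = feat / feat.norm(dim=-1, keepdim=True)
    return {"data": feat.tolist()}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7863)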
Start the container service:
docker run -d -p7863:7863 kevinchina/deeplearning:clip_chinese
Call the API:
import requests
import torch
# available models: ["RN50", "ViT-B-16", "ViT-L-14", "ViT-L-14-336", "ViT-H-14"]
url = "http://home.elvisiky.com:7863/cal_image_feature"
with open("no_text.png", "rb") as f:
    files = {"fileb": ("no_text.png", f)}  # the file field key is "fileb"
    data = {"model_name": "ViT-B-16"}  # pass model_name as form data
    res1 = requests.post(url=url, files=files, data=data, timeout=None)
print(res1.json())
url = "http://home.elvisiky.com:7863/cal_text_feature"
data = {"text": "夕阳", "model_name": "ViT-B-16"}  # pass model_name as form data
res2 = requests.post(url=url, data=data, timeout=None)
print(res2.json())
data = {"text": "怪兽", "model_name": "ViT-B-16"}  # pass model_name as form data
res3 = requests.post(url=url, data=data, timeout=None)
print(res3.json())
image_vector = res1.json()["data"]
text_vector = res2.json()["data"] + res3.json()["data"]
vector1 = torch.tensor(image_vector)
vector2 = torch.tensor(text_vector)
inner = vector1 @ vector2.t()
print("inner products between one image and several texts:", inner)
logits_per_image = 100 * inner
print("converted to softmax probabilities:", logits_per_image.softmax(dim=-1))