Following up on my previous article, Running yolov8-nano Inference with onnxruntime-web, a reader asked whether it's possible to hook into a live video stream and recognize objects in real time.
First, capture the camera video stream with getUserMedia.
The getUserMedia API gives access to the device's camera and microphone. You can use it to obtain a video stream and display it in a <video> element on the page.
Things to note:
- Browser support: getUserMedia is supported by most modern browsers, but it may not work in some older ones. You can check support on the Can I Use website (a quick runtime check is sketched after this list).
- HTTPS connection: camera access generally requires an HTTPS context. If you run into problems during development, make sure your local dev server runs over a secure context (for example via localhost or the https:// protocol).
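Before requesting the stream, it's worth feature-detecting the API and the secure context. A minimal sketch (the canUseCamera name is my own, not from the code below):

// Feature-detect getUserMedia and the secure context before asking for the camera.
// window.isSecureContext is true on https:// pages and on localhost.
function canUseCamera() {
  if (!window.isSecureContext) {
    console.warn("Camera access requires HTTPS (or localhost).");
    return false;
  }
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    console.warn("getUserMedia is not supported by this browser.");
    return false;
  }
  return true;
}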
import React, { useState, useRef, useEffect } from "react";

const App = () => {
  const [hasVideo, setHasVideo] = useState(true); // whether the stream was acquired
  const videoRef = useRef(null); // ref to the <video> element

  useEffect(() => {
    // async helper that requests the camera stream
    const getVideoStream = async () => {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({
          video: true, // request video only, no audio
        });
        // on success, attach the stream to the <video> element
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
        }
        setHasVideo(true);
      } catch (err) {
        console.error("Error accessing the camera: ", err);
        setHasVideo(false);
      }
    };

    getVideoStream();

    // cleanup: stop all tracks when the component unmounts
    return () => {
      if (videoRef.current && videoRef.current.srcObject) {
        const stream = videoRef.current.srcObject;
        stream.getTracks().forEach((track) => track.stop()); // stop the stream
      }
    };
  }, []);

  return (
    <div className="video-container">
      {/* keep the <video> mounted at all times so videoRef.current is already
          set when the stream arrives; only the error message is conditional */}
      <video ref={videoRef} autoPlay playsInline className="video-stream" />
      {!hasVideo && <p>Unable to access the camera.</p>}
    </div>
  );
};

export default App;
The result looks like this. However, we can't draw the detection boxes on the video element itself, so we redraw each video frame onto a canvas; that makes it easy to draw the boxes at the detected positions later.
Adding the frame-grabbing and model-inference code:
import React, { useEffect, useRef, useState } from "react";
import "./style/Camera.css";
import cv from "@techstark/opencv-js";
import { download } from "./utils/download";
import { InferenceSession, Tensor } from "onnxruntime-web";
import { detectImage } from "./utils/detect";

const CameraToCanvas = () => {
  const videoRef = useRef(null);
  const canvasRef = useRef(null);
  const imageRef = useRef(null);
  // Keep the session in a ref rather than in state: the rAF loop below is
  // created once inside useEffect, so a state value captured by that closure
  // would stay null forever.
  const sessionRef = useRef(null);
  const [loading, setLoading] = useState({ text: "Loading OpenCV.js", progress: null });

  // Configs
  const modelName = "yolov8n.onnx";
  const modelInputShape = [1, 3, 640, 640];
  const topk = 100;
  const iouThreshold = 0.45;
  const scoreThreshold = 0.25;

  useEffect(() => {
    let rafId = null;

    cv["onRuntimeInitialized"] = async () => {
      const baseModelURL = `${process.env.PUBLIC_URL}/model`;

      // create the main session
      const url = `${baseModelURL}/${modelName}`;
      console.log(`url:${url}`);
      const arrBufNet = await download(
        url, // url
        ["Loading YOLOv8", setLoading] // logger
      );
      const yolov8 = await InferenceSession.create(arrBufNet);

      // create the NMS session
      const arrBufNMS = await download(
        `${baseModelURL}/nms-yolov8.onnx`, // url
        ["Loading NMS model", setLoading] // logger
      );
      const nms = await InferenceSession.create(arrBufNMS);

      // warm up the main model with an all-zeros input
      setLoading({ text: "Warming up the model...", progress: null });
      const tensor = new Tensor(
        "float32",
        new Float32Array(modelInputShape.reduce((a, b) => a * b)),
        modelInputShape
      );
      await yolov8.run({ images: tensor });

      sessionRef.current = { net: yolov8, nms: nms };
      setLoading(null);
    };

    // start the camera stream
    async function startCamera() {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: true });
        if (videoRef.current) {
          videoRef.current.srcObject = stream;
          videoRef.current.play();
        }
      } catch (error) {
        console.error("Error accessing the camera:", error);
      }
    }
    startCamera();

    const drawToCanvas = () => {
      const video = videoRef.current;
      const canvas = canvasRef.current;
      // wait until the video reports real dimensions before drawing
      if (video && canvas && video.videoWidth > 0) {
        const context = canvas.getContext("2d");
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        context.drawImage(video, 0, 0, canvas.width, canvas.height);
        // copy the current frame into the hidden <img> used for preprocessing
        imageRef.current.src = canvas.toDataURL();
        if (sessionRef.current) {
          detectImage(imageRef.current, canvas, sessionRef.current, topk, iouThreshold, scoreThreshold, modelInputShape);
        }
      }
      rafId = requestAnimationFrame(drawToCanvas);
    };
    rafId = requestAnimationFrame(drawToCanvas);

    return () => {
      // stop the render loop and the camera stream on unmount
      if (rafId) cancelAnimationFrame(rafId);
      if (videoRef.current && videoRef.current.srcObject) {
        videoRef.current.srcObject.getTracks().forEach((track) => track.stop());
      }
    };
  }, []);

  return (
    <div className="video-container">
      <video ref={videoRef} style={{ display: "none" }} />
      <canvas ref={canvasRef} />
      <img ref={imageRef} src="#" alt="" style={{ display: "none" }} />
    </div>
  );
};

export default CameraToCanvas;
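The component imports a download helper from ./utils/download that isn't shown in this post. Judging from the call sites, it fetches a URL as an ArrayBuffer while reporting progress through the [text, setLoading] logger pair. A rough sketch of what it might look like (my own reconstruction, not the actual file):

// Hypothetical sketch of utils/download.js: fetch a file as an ArrayBuffer
// while reporting progress through the logger pair [text, setLoading].
export const download = async (url, [text, setLoading]) => {
  const response = await fetch(url);
  const total = Number(response.headers.get("Content-Length")) || 0;
  const reader = response.body.getReader();
  const chunks = [];
  let received = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    received += value.length;
    // progress is a percentage when the size is known, otherwise null
    setLoading({ text, progress: total ? Math.round((received / total) * 100) : null });
  }
  // concatenate the chunks into a single buffer
  const data = new Uint8Array(received);
  let offset = 0;
  for (const chunk of chunks) {
    data.set(chunk, offset);
    offset += chunk.length;
  }
  return data.buffer;
};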
The detection method that runs the model:
export const detectImage = async (
image,
canvas,
session,
topk,
iouThreshold,
scoreThreshold,
inputShape
) => {
const [modelWidth, modelHeight] = inputShape.slice(2);
const [input, xRatio, yRatio] = preprocessing(image, modelWidth, modelHeight);
const tensor = new Tensor("float32", input.data32F, inputShape); // to ort.Tensor
const config = new Tensor(
"float32",
new Float32Array([
topk, // topk per class
iouThreshold, // iou threshold
scoreThreshold, // score threshold
])
); // nms config tensor
const {output0} = await session.net.run({images: tensor}); // run session and get output layer
const {selected} = await session.nms.run({detection: output0, config: config}); // perform nms and filter boxes
const boxes = [];
// looping through output
for (let idx = 0; idx < selected.dims[1]; idx++) {
const data = selected.data.slice(idx * selected.dims[2], (idx + 1) * selected.dims[2]); // get rows
const box = data.slice(0, 4);
const scores = data.slice(4); // classes probability scores
const score = Math.max(...scores); // maximum probability scores
const label = scores.indexOf(score); // class id of maximum probability scores
const [x, y, w, h] = [
(box[0] - 0.5 * box[2]) * xRatio, // upscale left
(box[1] - 0.5 * box[3]) * yRatio, // upscale top
box[2] * xRatio, // upscale width
box[3] * yRatio, // upscale height
]; // keep boxes in maxSize range
boxes.push({
label: label,
probability: score,
bounding: [x, y, w, h], // upscale box
}); // update boxes to draw later
}
if (boxes.length > 0) {
renderBoxes(canvas, boxes); // Draw boxes
}
input.delete(); // delete unused Mat
};
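detectImage also relies on a preprocessing helper (and a renderBoxes helper) from the same utils module, which isn't shown here. A sketch of what such a helper typically does in OpenCV.js, assuming the common pad-to-square-then-blob approach; the ratios it returns are what detectImage uses to upscale the boxes (my reconstruction, not necessarily the actual file):

// Hypothetical sketch of the preprocessing helper: pad the frame to a square,
// then build a normalized NCHW blob for the model. The returned ratios let
// detectImage map boxes from model space back to the padded-image space.
const preprocessing = (source, modelWidth, modelHeight) => {
  const mat = cv.imread(source); // read the <img> into an RGBA Mat
  const matC3 = new cv.Mat(mat.rows, mat.cols, cv.CV_8UC3);
  cv.cvtColor(mat, matC3, cv.COLOR_RGBA2BGR); // drop the alpha channel

  // pad right/bottom so the image is square, preserving the aspect ratio
  const maxSize = Math.max(matC3.rows, matC3.cols);
  const xPad = maxSize - matC3.cols;
  const yPad = maxSize - matC3.rows;
  const xRatio = maxSize / matC3.cols;
  const yRatio = maxSize / matC3.rows;
  const matPad = new cv.Mat();
  cv.copyMakeBorder(matC3, matPad, 0, yPad, 0, xPad, cv.BORDER_CONSTANT);

  // 1/255 normalization + resize to the model input size, in one blob
  const input = cv.blobFromImage(
    matPad,
    1 / 255.0,
    new cv.Size(modelWidth, modelHeight),
    new cv.Scalar(0, 0, 0),
    true, // swapRB: BGR -> RGB
    false
  );

  mat.delete();
  matC3.delete();
  matPad.delete();
  return [input, xRatio, yRatio];
};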
Testing it out: the video stream is still choppy, so I settled for a static scene for the test.
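One likely cause of the stutter is that the loop above kicks off a new asynchronous inference on every animation frame, so calls pile up faster than the model can finish. A simple mitigation is to skip frames while a detection is still in flight. A sketch of the adjusted loop (the inFlight guard is my own addition, not part of the component above):

// Skip frames while an inference is still running instead of queuing one
// per animation frame. `inFlight` is a hypothetical guard variable.
let inFlight = false;

const drawToCanvas = () => {
  const video = videoRef.current;
  const canvas = canvasRef.current;
  if (video && canvas && video.videoWidth > 0) {
    const context = canvas.getContext("2d");
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    context.drawImage(video, 0, 0, canvas.width, canvas.height);
    if (sessionRef.current && !inFlight) {
      inFlight = true;
      imageRef.current.src = canvas.toDataURL();
      detectImage(imageRef.current, canvas, sessionRef.current, topk,
        iouThreshold, scoreThreshold, modelInputShape)
        .finally(() => { inFlight = false; }); // release the guard when done
    }
  }
  rafId = requestAnimationFrame(drawToCanvas);
};

The canvas still updates at the full frame rate; only the detection runs as fast as the model allows, which keeps the preview smooth even when inference lags behind.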