kitti数据集理解及可视化

kitti数据集简介

kitti数据集是比较早出来的3D检测方面的数据集，相对来说数据结构简单，适合做单目检测的工作，目前也是业界和学术界常用的公开数据集。
自己最近也在做单目3D检测的工作，所以也分享一些理解，希望能给到一些人启发和帮助。
kitti数据的目录结构如下：
只需要下载下图所示的四个zip包解压就行，做单目3D检测已经足够。
kitti
解压后以training为例，其中calib里面是传感器的标定参数，有一堆txt文件，内容格式如下：

P0: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 0.000000000000e+00 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
P1: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.875744000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
P2: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 4.485728000000e+01 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.163791000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.745884000000e-03
P3: 7.215377000000e+02 0.000000000000e+00 6.095593000000e+02 -3.395242000000e+02 0.000000000000e+00 7.215377000000e+02 1.728540000000e+02 2.199936000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 2.729905000000e-03
R0_rect: 9.999239000000e-01 9.837760000000e-03 -7.445048000000e-03 -9.869795000000e-03 9.999421000000e-01 -4.278459000000e-03 7.402527000000e-03 4.351614000000e-03 9.999631000000e-01
Tr_velo_to_cam: 7.533745000000e-03 -9.999714000000e-01 -6.166020000000e-04 -4.069766000000e-03 1.480249000000e-02 7.280733000000e-04 -9.998902000000e-01 -7.631618000000e-02 9.998621000000e-01 7.523790000000e-03 1.480755000000e-02 -2.717806000000e-01
Tr_imu_to_velo: 9.999976000000e-01 7.553071000000e-04 -2.035826000000e-03 -8.086759000000e-01 -7.854027000000e-04 9.998898000000e-01 -1.482298000000e-02 3.195559000000e-01 2.024406000000e-03 1.482454000000e-02 9.998881000000e-01 -7.997231000000e-01

p0到p4对应的是rect相机坐标系到像素坐标系的变换矩阵，一般论文中叫内参，实际上是相机内参和相机外参以及畸变矫正融合的矩阵。
这里提几点：
kitti有四个相机，两个彩色的，两个灰度的，左侧彩色(对应序号2)的是有对应的标注信息的，所以如果做单目的话，用p2。
rect相机坐标系是对参考坐标系进行旋转矫正，使得4个相机的光心共面，这样做有一定好处，具体好处是什么可以不用在意。
Tr_velo_to_cam是雷达坐标系到参考相机坐标系。
Tr_imu_to_velo是imu到雷达坐标系。
image_2里面就是图片了。
label_2里面就是对应的标注信息了，3D标注是在rect相机坐标系中的。

Pedestrian 0.00 0 -0.20 712.40 143.00 810.73 307.92 1.89 0.48 1.20 1.84 1.47 8.41 0.01

第一列是类别，第二列是截断程度(0~1)，第三列是遮挡程度(0,1,2,3)，第四列是观察者角度alpha(没啥用)，接下来五列到八列是2d框的信息，xmin ymin xmax ymax四个坐标，九列到十一列是3D框的高宽长，十二到十四列是3D框下底面中心点，最后一列是绕y轴的旋转角。
velodyne里面是雷达。另外一个velodyne_reduced是mmdetection3d转数据格式时生成的，可以不在意。

可视化

main.py文件：

import os
import cv2
import numpy as np
import mayavi.mlab as mlab
from utils import draw_lidar, draw_2dbox, gen_3dbox, project_box3d, draw_project, draw_box3d_lidar


class Kitti:
    def __init__(self, root_path="./kitti", ind=0) -> None:
        self.root_path = root_path
        train_file = os.path.join(root_path, "ImageSets/train.txt")
        with open(train_file, 'r') as f:
            names = f.readlines()
        self.names = [name.rstrip() for name in names]
        self.name = self.names[ind]

    def get_image(self, show=False):
        img_path = os.path.join(self.root_path, "training/image_2", self.name+".png")
        img = cv2.imread(img_path)
        if show and os.path.exists(img_path):
            cv2.imshow("origin image", img)
            if cv2.waitKey(0) == ord("q"):
                cv2.destroyAllWindows()
        return img

    def get_lidar(self, show=False):
        lidar_path = os.path.join(self.root_path, "training/velodyne", self.name+".bin")
        lidar = np.fromfile(lidar_path, dtype=np.float32)
        lidar = lidar.reshape((-1, 4))
        if show:
            fig = mlab.figure(figure=None, bgcolor=(0, 0, 0), engine=None,
                            size=(1000, 500))
            draw_lidar(lidar, fig=fig)
            mlab.show()
        return lidar
    
    def get_calib(self):
        calib = {}
        calib_path = os.path.join(self.root_path, "training/calib", self.name+".txt")
        with open(calib_path, 'r') as cf:
            infos = cf.readlines()
            infos = [x.rstrip() for x in infos]
        for info in infos:
            if len(info) == 0:
                continue
            key, value = info.split(":", 1)
            calib[key] = np.array([float(x) for x in value.split()])
        calib_format = self.format_calib(calib)
        return calib_format

    def format_calib(self, calib):
        calib_format = {}
        # projection matrix from rect coord to image coord.
        rect2image = calib["P2"]
        rect2image = rect2image.reshape([3, 4])
        calib_format["rect2image"] = rect2image
        # projection matrix from lidar coord to reference cam coord.
        lidar2cam = calib["Tr_velo_to_cam"]
        lidar2cam = lidar2cam.reshape([3, 4])
        calib_format["lidar2cam"] = lidar2cam
        # projection matrix from rect cam coord to reference cam coord.
        rect2ref = calib["R0_rect"]
        rect2ref = rect2ref.reshape([3, 3])
        calib_format["rect2ref"] = rect2ref
        return calib_format

    def get_anns(self):
        anns = []
        label_path = os.path.join(self.root_path, "training/label_2", self.name+".txt")
        with open(label_path, 'r') as lf:
            labels = lf.readlines()
            labels = [label.rstrip() for label in labels]
        for label in labels:
            ann_format = {}
            ann = label.split(" ")
            class_name = ann[0]
            ann_format["class_name"]=class_name
            ann_ = [float(x) for x in ann[1:]]
            truncation = ann_[0] # truncated pixel ratio [0..1]
            ann_format["truncation"]=truncation
            occlusion = ann_[1] # 0=visible, 1=partly occluded, 2=fully occluded, 3=unknown
            ann_format["occlusion"]=occlusion
            alpha = ann_[2]
            ann_format["alpha"]=alpha # object observation angle [-pi..pi]

            #2D box
            xmin, ymin, xmax, ymax = ann_[3], ann_[4], ann_[5], ann_[6]
            box2d = np.array([xmin, ymin, xmax, ymax])
            ann_format["box2d"]=box2d

            #3D box
            box3d = {}
            h, w, l = ann_[7], ann_[8], ann_[9]
            cx, cy, cz = ann_[10], ann_[11], ann_[12]
            box3d["dim"] = np.array([l, w, h])
            box3d["center"] = np.array([cx, cy, cz])
            yaw = ann_[13]
            box3d["rotation"] = yaw# yaw angle [-pi..pi]
            ann_format["box3d"]=box3d

            anns.append(ann_format)
        return anns


class VisKitti:
    def __init__(self, root_path="./kitti", ind=0) -> None:
        self.kitti = Kitti(root_path=root_path, ind=ind)
        self.calib = self.kitti.get_calib()
        self.anns = self.kitti.get_anns()

    def show_origin_image(self):
        self.kitti.get_image(show=True)

    def show_origin_lidar(self):
        self.kitti.get_lidar(show=True)

    def show_image_with_2dbox(self, save=False):
        img = self.kitti.get_image()
        bbox = []
        names = []
        for ann in self.anns:
            bbox.append(ann["box2d"])
            names.append(ann["class_name"])
        draw_2dbox(img, bbox, names, save=save)
        

    def show_image_with_project_3dbox(self, show=True):
        img = self.kitti.get_image()
        bbox = []
        for ann in self.anns:
            bbox.append(ann["box3d"])
        bbox3d = gen_3dbox(bbox3d=bbox)
        project_xy,_ = project_box3d(bbox3d, self.calib)
        draw_project(img, project_xy, save=False)
        

    def show_lidar_with_3dbox(self, img_fov=False):
        img = self.kitti.get_image()
        bbox = []
        for ann in self.anns:
            bbox.append(ann["box3d"])
        bbox3d = gen_3dbox(bbox3d=bbox)
        fig = mlab.figure(figure=None, bgcolor=(0, 0, 0), engine=None,
                        size=(1000, 500))
        lidar = self.kitti.get_lidar()
        fig = draw_lidar(lidar, fig=fig)
        fig = draw_box3d_lidar(bbox3d, self.calib, fig)
        mlab.show()     



if __name__ == "__main__":
    vis = VisKitti()
    print("1: show_origin_image")
    print("2: show_origin_lidar")
    print("3: show_image_with_2dbox")
    print("4: show_image_with_project_3dbox")
    print("5: show_lidar_with_3dbox")
    
    choice = input("please choice number:")
    if choice=="1":
        vis.show_origin_image()
    elif choice=="2":
        vis.show_origin_lidar()
    elif choice=="3":
        vis.show_image_with_2dbox()
    elif choice=="4":
        vis.show_image_with_project_3dbox()
    elif choice=="5":
        vis.show_lidar_with_3dbox()

utils.py 文件

import os
import cv2
import numpy as np
import mayavi.mlab as mlab


def draw_lidar(pc,
               color=None,
               fig=None,
               bgcolor=(0, 0, 0),
               pts_scale=0.3,
               pts_mode="point",
               pts_color=None,
               color_by_intensity=False,
               ):
    if fig == None:
        fig = mlab.figure(figure=None, bgcolor=bgcolor, engine=None,
                          size=(1600, 1000))
    if color == None:
        color = pc[:, 2]

    s = pc[:, 3] if color_by_intensity else pc[:, 2]
    mlab.points3d(pc[:, 0],
                  pc[:, 1],
                  pc[:, 2],
                  s,
                  color=pts_color,
                  mode=pts_mode,
                  colormap="gnuplot",
                  scale_factor=pts_scale,
                  figure=fig)

    # draw origin point
    mlab.points3d(0, 0, 0, color=(1, 1, 1), mode="sphere", scale_factor=0.2, figure=fig)

    # draw axis
    axis_xyz = np.array([[2, 0, 0],[0, 2, 0],[0, 0, 2]], dtype=np.float64)
    # x axis
    mlab.plot3d([0, axis_xyz[0, 0]],
                [0, axis_xyz[0, 1]],
                [0, axis_xyz[0, 2]],
                color=(1, 0, 0),
                tube_radius=None,
                figure=fig)
    # y axis
    mlab.plot3d([0, axis_xyz[1, 0]],
                [0, axis_xyz[1, 1]],
                [0, axis_xyz[1, 2]],
                color=(0, 1, 0),
                tube_radius=None,
                figure=fig)
    # z axis
    mlab.plot3d([0, axis_xyz[2, 0]],
                [0, axis_xyz[2, 1]],
                [0, axis_xyz[2, 2]],
                color=(0, 0, 1),
                tube_radius=None,
                figure=fig)
    mlab.view(
        azimuth=180,
        elevation=70,
        focalpoint=[12.0909996, -1.04700089, -2.03249991],
        distance=62.0,
        figure=fig,
    )
    return fig

def draw_2dbox(img, bbox, names=None, save=False):
    assert len(bbox)==len(names), "names not match bbox"
    color_map = {"Car":(0, 255, 0), "Pedestrian":(255, 0, 0), "Cyclist":(0, 0, 255)}
    for i, box in enumerate(bbox):
        name = names[i]
        if name not in color_map.keys():
            continue
        color = color_map[name]
        cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color,
            2,
        )
        name_coord = (int(box[0]), int(max(box[1]-5, 0)))
        cv2.putText(img, name, name_coord, 
                    cv2.FONT_HERSHEY_PLAIN, 1, color, 1)

    cv2.imshow("image_with_2dbox", img)
    if cv2.waitKey(0) == ord("q"):
        cv2.destroyAllWindows()
    if save:
        cv2.imwrite("image_with_2dbox.jpg", img)

def rotx(t):
    """ 3D Rotation about the x-axis. """
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[1, 0, 0], [0, c, -s], [0, s, c]])


def roty(t):
    """ Rotation about the y-axis. """
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])


def rotz(t):
    """ Rotation about the z-axis. """
    c = np.cos(t)
    s = np.sin(t)
    return np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])

def expand_matrix(matrix):
    new_matrix = np.eye(4, 4)
    new_matrix[:3, :] = matrix
    return new_matrix

def gen_3dbox(bbox3d):
    corners_3d_all = []
    for box in bbox3d:
        center = box["center"]
        l, w, h = box["dim"]
        angle = box["rotation"]
        R = roty(angle)
        # 3d bounding box corners
        x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2]
        y_corners = [0, 0, 0, 0, -h, -h, -h, -h]
        z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2]

        corners = np.vstack([x_corners, y_corners, z_corners])
        corners_3d = np.dot(R, corners)
        corners_3d[0, :] += center[0]
        corners_3d[1, :] += center[1]
        corners_3d[2, :] += center[2]
        corners_3d_all.append(corners_3d)
    return corners_3d_all

def project_box3d(bbox3d, calib):
    P = calib["rect2image"]
    P = expand_matrix(P)
    project_xy = []
    project_z = []
    for box3d in bbox3d:
        if np.any(box3d[:, 2] < 0.1):
            continue
        box3d = np.concatenate([box3d, np.zeros((1, 8))], axis=0)
        project_3dbox = np.dot(P, box3d)[:3, :]
        pz = project_3dbox[2, :]
        px = project_3dbox[0, :]/pz
        py = project_3dbox[1, :]/pz
        xy = np.stack([px, py], axis=1)
        project_xy.append(xy)
        project_z.append(pz)
    return project_xy, project_z

def draw_project(img, project_xy, save=False):
    color_map = {"Car":(0, 255, 0), "Pedestrian":(255, 0, 0), "Cyclist":(0, 0, 255)}
    for i, qs in enumerate(project_xy):
        color = (0, 255, 0)
        qs = qs.astype(np.int32)
        for k in range(0, 4):
            i, j = k, (k + 1) % 4
            # use LINE_AA for opencv3
            cv2.line(img, (qs[i, 0], qs[i, 1]), (qs[j, 0], qs[j, 1]), color, 1)
            i, j = k + 4, (k + 1) % 4 + 4
            cv2.line(img, (qs[i, 0], qs[i, 1]), (qs[j, 0], qs[j, 1]), color, 1)
            i, j = k, k + 4
            cv2.line(img, (qs[i, 0], qs[i, 1]), (qs[j, 0], qs[j, 1]), color, 1)

    cv2.imshow("image_with_projectbox", img)
    if cv2.waitKey(0) == ord("q"):
        cv2.destroyAllWindows()
    if save:
        cv2.imwrite("image_with_projectbox.jpg", img)

def draw_gt_boxes3d(
    gt_boxes3d,
    fig,
    color=(1, 1, 1),
    line_width=1,
):
    """ Draw 3D bounding boxes
    Args:
        gt_boxes3d: numpy array (n,8,3) for XYZs of the box corners
        fig: mayavi figure handler
        color: RGB value tuple in range (0,1), box line color
        line_width: box line width
        draw_text: boolean, if true, write box indices beside boxes
        text_scale: three number tuple
        color_list: a list of RGB tuple, if not None, overwrite color.
    Returns:
        fig: updated fig
    """
    num = len(gt_boxes3d)
    for n in range(num):
        b = gt_boxes3d[n]

        for k in range(0, 4):
            i, j = k, (k + 1) % 4
            mlab.plot3d(
                [b[i, 0], b[j, 0]],
                [b[i, 1], b[j, 1]],
                [b[i, 2], b[j, 2]],
                color=color,
                tube_radius=None,
                line_width=line_width,
                figure=fig,
            )

            i, j = k + 4, (k + 1) % 4 + 4
            mlab.plot3d(
                [b[i, 0], b[j, 0]],
                [b[i, 1], b[j, 1]],
                [b[i, 2], b[j, 2]],
                color=color,
                tube_radius=None,
                line_width=line_width,
                figure=fig,
            )

            i, j = k, k + 4
            mlab.plot3d(
                [b[i, 0], b[j, 0]],
                [b[i, 1], b[j, 1]],
                [b[i, 2], b[j, 2]],
                color=color,
                tube_radius=None,
                line_width=line_width,
                figure=fig,
            )
    return fig

def inverse_rigid_trans(Tr):
    """ Inverse a rigid body transform matrix (3x4 as [R|t])
        [R'|-R't; 0|1]
    """
    inv_Tr = np.zeros_like(Tr)  # 3x4
    inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3])
    inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3])
    return inv_Tr

def draw_box3d_lidar(bbox3d, calib, fig):
    # method 1
    # lidar2cam = calib["lidar2cam"]
    # lidar2cam = expand_matrix(lidar2cam)
    # cam2rect_ = calib["rect2ref"]
    # cam2rect = np.eye(4, 4)
    # cam2rect[:3, :3] = cam2rect_
    # lidar2rec = np.dot(lidar2cam, cam2rect)
    # rec2lidar = np.linalg.inv(lidar2rec) #(AB)-1 = B-1@A-1

    # method 2
    lidar2cam_ = calib["lidar2cam"]
    cam2rect_ = calib["rect2ref"]
    cam2rect = np.eye(4, 4)
    cam2rect[:3, :3] = cam2rect_
    
    # lidar2cam = np.eye(4, 4)
    # lidar2cam[:3, :] = lidar2cam_
    # cam2lidar = np.linalg.inv(lidar2cam)
    cam2lidar_ = inverse_rigid_trans(lidar2cam_)
    cam2lidar = np.eye(4, 4)
    cam2lidar[:3, :] = cam2lidar_

    all_lidar_box3d = []
    for box3d in bbox3d:
        if np.any(box3d[:, 2] < 0.1):
            continue
        box3d = np.concatenate([box3d, np.ones((1, 8))], axis=0)
        box3d_in_refcam = np.dot(np.linalg.inv(cam2rect), box3d)
        lidar_box3d = np.dot(cam2lidar, box3d_in_refcam)[:3, :]
        lidar_box3d = np.transpose(lidar_box3d)
        all_lidar_box3d.append(lidar_box3d)
    print(all_lidar_box3d)
    fig = draw_gt_boxes3d(all_lidar_box3d, fig)
    return fig