CR-NeRF 代码eval.py解析

这段代码是 CR-NeRF（Cross-Ray Neural Radiance Fields，跨光线神经辐射场）模型的推理脚本 eval.py。它主要用于生成并保存渲染图像；脚本中还预留了用于记录图像质量评价指标（如PSNR和SSIM）的列表，但本文摘录的代码片段并未包含实际的指标计算部分。以下是对这段代码的详细解析：

（1）导入了所需的库和模块

包括PyTorch、NumPy、tqdm（用于进度条）、imageio（用于图像保存）、以及其他自定义模块和函数。

import torch
import os
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import imageio
from argparse import ArgumentParser

from models.rendering import render_rays_cross_ray
from models.nerf import *
from models.nerf_decoder_stylenerf import get_renderer
from utils import load_ckpt
import metrics
from einops import rearrange
from datasets import dataset_dict
from datasets.depth_utils import *
from models.linearStyleTransfer import encoder3, encoder_sameoutputsize
from models.networks import E_attr
from math import sqrt
import math
import json
from PIL import Image
from torchvision import transforms as T
from opt import get_opts
from train_mask_grid_sample import get_model
torch.backends.cudnn.benchmark = True

（2）定义函数

batched_inference()，这个函数用于对光线进行批量推理。它将光线分成小块进行处理，以避免内存不足的问题。

from collections import defaultdict
import torch

def batched_inference(models, embeddings,
                      rays, ts, N_samples, N_importance, use_disp,
                      chunk,
                      white_back,
                      **kwargs):
    """Render a batch of rays chunk by chunk and merge the outputs.

    The rays are split along dimension 0 into slices of at most ``chunk``
    rows. Each slice is passed to ``render_rays_cross_ray`` with
    ``test_time=True``, and the per-key outputs of all slices are
    concatenated along dimension 0.

    Args:
        models: Forwarded unchanged to ``render_rays_cross_ray``.
        embeddings: Forwarded unchanged to ``render_rays_cross_ray``.
        rays: Tensor whose first dimension indexes the rays.
        ts: Per-ray tensor sliced in lockstep with ``rays``, or ``None``.
        N_samples: Forwarded unchanged to the renderer.
        N_importance: Forwarded unchanged to the renderer.
        use_disp: Forwarded unchanged to the renderer.
        chunk: Maximum number of rays rendered per call; also forwarded
            to the renderer.
        white_back: Forwarded unchanged to the renderer.
        **kwargs: Extra keyword arguments forwarded to the renderer.

    Returns:
        A ``defaultdict`` mapping every key produced by the renderer to
        the concatenation (dim 0) of that key's per-chunk tensors. It is
        empty when ``rays`` contains no rows.
    """
    num_rays = rays.shape[0]
    collected = defaultdict(list)

    for start in range(0, num_rays, chunk):
        stop = start + chunk
        ts_slice = None if ts is None else ts[start:stop]
        # The two literal zeros are passed positionally between use_disp and
        # N_importance; presumably perturb / noise_std — confirm against the
        # signature of render_rays_cross_ray.
        chunk_out = render_rays_cross_ray(
            models, embeddings,
            rays[start:stop], ts_slice,
            N_samples, use_disp,
            0, 0,
            N_importance, chunk, white_back,
            test_time=True,
            **kwargs)

        for key, value in chunk_out.items():
            collected[key].append(value)

    # Replace each list of per-chunk tensors by a single concatenated tensor.
    for key in list(collected):
        collected[key] = torch.cat(collected[key], 0)

    return collected

定义函数eulerAnglesToRotationMatrix()，这个函数用于将欧拉角（弧度制，theta[0]、theta[1]、theta[2] 依次为绕x、y、z轴的旋转角）转换为3×3旋转矩阵，合成顺序为 R = R_z · R_y · R_x。

def eulerAnglesToRotationMatrix(theta):
    """Build a 3x3 rotation matrix from three Euler angles.

    Args:
        theta: Indexable holding three angles in radians: ``theta[0]``
            rotates about the x axis, ``theta[1]`` about the y axis and
            ``theta[2]`` about the z axis.

    Returns:
        The ``numpy`` array ``R_z . R_y . R_x`` of shape (3, 3), i.e. the
        x rotation is applied first and the z rotation last.
    """
    cos_x, sin_x = math.cos(theta[0]), math.sin(theta[0])
    cos_y, sin_y = math.cos(theta[1]), math.sin(theta[1])
    cos_z, sin_z = math.cos(theta[2]), math.sin(theta[2])

    rot_x = np.array([
        [1, 0, 0],
        [0, cos_x, -sin_x],
        [0, sin_x, cos_x],
    ])
    rot_y = np.array([
        [cos_y, 0, sin_y],
        [0, 1, 0],
        [-sin_y, 0, cos_y],
    ])
    rot_z = np.array([
        [cos_z, -sin_z, 0],
        [sin_z, cos_z, 0],
        [0, 0, 1],
    ])

    # Same association as the nested np.dot calls: R_z . (R_y . R_x).
    return rot_z.dot(rot_y.dot(rot_x))

（3）主程序

这段代码是主程序的开始部分，主要负责初始化参数、加载数据集、定义嵌入和编码器等。

if __name__ == "__main__":
    # Script entry point: the statements below run only when the file is executed directly.
    args = get_opts()
    # Options object returned by get_opts(); presumably parsed from the command line — confirm in opt.py.

    kwargs = {'root_dir': args.root_dir, 'split': args.split}
    # Keyword arguments for the dataset constructor: root directory and split name.

    if args.dataset_name == 'blender':
        # Blender data: pass the target image size as a (w, h) tuple.
        kwargs['img_wh'] = tuple(args.img_wh)
    else:
        # Every other dataset: pass the downscale factor and the cache flag instead.
        kwargs['img_downscale'] = args.img_downscale
        kwargs['use_cache'] = args.use_cache

    dataset = dataset_dict[args.dataset_name](args=args, **kwargs)
    # Look up the dataset class by name and instantiate it.

    scene = os.path.basename(args.root_dir.strip('/'))
    # Scene name = last path component of root_dir (leading/trailing '/' are stripped first).

    embedding_xyz = PosEmbedding(args.N_emb_xyz-1, args.N_emb_xyz)
    embedding_dir = PosEmbedding(args.N_emb_dir-1, args.N_emb_dir)
    # Embedding objects for positions and for view directions.
    # NOTE(review): PosEmbedding is not defined in this file; presumably it comes
    # from the star import of models.nerf — confirm.

    embeddings = {'xyz': embedding_xyz, 'dir': embedding_dir}
    # Both embeddings bundled in one dict, keyed 'xyz' and 'dir'.

    if args.encode_a:
        # Appearance-encoder path, enabled by args.encode_a.
        enc_a = encoder_sameoutputsize(out_channel=args.nerf_out_dim).cuda()
        # Build the encoder and move it to the GPU.

        load_ckpt(enc_a, args.ckpt_path, model_name='enc_a')
        # Restore the encoder weights stored under 'enc_a' in the checkpoint.

        kwargs = {}
        # Drop the dataset-constructor arguments; kwargs is reused below for rendering extras.
        # NOTE(review): this reset only happens inside the encode_a branch, so without
        # encode_a the dataset arguments stay in kwargs — confirm that is intended.

        if args.dataset_name == 'blender':
            # Blender data: derive the appearance embedding from the first training frame.
            with open(os.path.join(args.root_dir, f"transforms_train.json"), 'r') as f:
                meta_train = json.load(f)
            # Metadata of the training split.

            frame = meta_train['frames'][0]
            # First frame entry.

            image_path = os.path.join(args.root_dir, f"{frame['file_path']}.png")
            # Path of that frame's PNG file.

            img = Image.open(image_path)
            img = img.resize(args.img_wh, Image.LANCZOS)
            # Open the image and resize it to args.img_wh with Lanczos resampling.

            toTensor = T.ToTensor()
            normalize = T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
            # Transforms: PIL image -> tensor, then (x - 0.5) / 0.5 per channel.

            img = toTensor(img) # (4, h, w)
            img = img[:3, :, :]*img[-1:, :, :] + (1-img[-1:, :, :]) # blend A to RGB (3, h, w)
            # The last channel is used as alpha to composite the RGB channels onto white.

            whole_img = normalize(img).unsqueeze(0).cuda()
            # Normalize, add a leading batch dimension, move to the GPU.

            kwargs['a_embedded_from_img'] = enc_a(whole_img)
            # Encode the image and keep the result in kwargs under 'a_embedded_from_img'.

（4）模型加载和初始化

这段代码加载了NeRF模型和解码器，并从checkpoints文件中恢复它们的权重。

# Build all networks, then restore each one from the same checkpoint file
# under its own model name (coarse NeRF, fine NeRF, decoder — in that order).
models = get_model(args)
nerf_coarse, nerf_fine, decoder = models['coarse'], models['fine'], models['decoder']
for network, ckpt_key in ((nerf_coarse, 'nerf_coarse'),
                          (nerf_fine, 'nerf_fine'),
                          (decoder, 'decoder')):
    load_ckpt(network, args.ckpt_path, model_name=ckpt_key)

（5）数据集预处理

针对不同的场景进行特定的预处理，包括参考图像的读取、下采样、归一化以及相机姿态（轨迹）的生成。每个场景都有各自的处理逻辑，以确保测试数据的一致性和合理性。

# Accumulators for the rendered frames and for per-image metrics.
# NOTE(review): psnrs and ssims are never appended to in the code shown in this listing.
imgs, psnrs, ssims = [], [], []

# Output directory <save_dir>/results/<dataset_name>/<scene_name>; created if missing.
dir_name = os.path.join(args.save_dir, f'results/{args.dataset_name}/{args.scene_name}')
os.makedirs(dir_name, exist_ok=True)

# Expose the options object through kwargs as well.
kwargs['args']=args

# phototourism + 'test' split: build a synthetic camera path and pick a reference
# image for the appearance embedding, hard-coded per scene.
# NOTE(review): enc_a is used below but is only created when args.encode_a is set
# in the setup code — confirm this branch is never reached without encode_a.
if args.dataset_name == 'phototourism' and args.split == 'test':
    # Width and height of the test images.
    dataset.test_img_w, dataset.test_img_h = args.img_wh
    
    # Focal length for a 60-degree field of view (half-angle pi/6) and the matching
    # intrinsic matrix with the principal point at the image centre.
    dataset.test_focal = dataset.test_img_w / 2 / np.tan(np.pi/6)
    dataset.test_K = np.array([
        [dataset.test_focal, 0, dataset.test_img_w / 2],
        [0, dataset.test_focal, dataset.test_img_h / 2],
        [0, 0, 1]
    ])
    
    # Scene-specific settings.
    if scene == 'brandenburg_gate':
        # Reference image for the appearance embedding: training image at position 314
        # of img_ids_train, loaded as RGB and downscaled by a factor of 8.
        img = Image.open(os.path.join(args.root_dir, 'dense/images', dataset.image_paths[dataset.img_ids_train[314]])).convert('RGB')
        img_downscale = 8
        img_w, img_h = img.size
        img_w = img_w // img_downscale
        img_h = img_h // img_downscale
        img = img.resize((img_w, img_h), Image.LANCZOS)
        
        # Convert to a tensor, normalize with (x - 0.5) / 0.5, add a batch dimension,
        # move to the GPU and encode.
        toTensor = T.ToTensor()
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        img = toTensor(img)
        whole_img = normalize(img).unsqueeze(0).cuda()
        kwargs['a_embedded_from_img'] = enc_a(whole_img)
        
        # NOTE(review): the raw index 314 is stored here, whereas the other scenes
        # store dataset.img_ids_train[<index>] — confirm which one is expected.
        dataset.test_appearance_idx = 314
        N_frames = 30 * 8

        # Per-frame translation offsets and rotation angles of the camera path.
        # NOTE(review): dx1 already has N_frames samples, so dx is longer than N_frames
        # and the loop below only reads dx1; dx2 is never used — confirm intended.
        dx1 = np.linspace(-0.25, 0.25, N_frames)
        dx2 = np.linspace(0.25, 0.38, N_frames - N_frames // 2)
        dx = np.concatenate((dx1, dx2))

        dy1 = np.linspace(0.05, -0.1, N_frames // 2)
        dy2 = np.linspace(-0.1, 0.05, N_frames - N_frames // 2)
        dy = np.concatenate((dy1, dy2))

        dz1 = np.linspace(0.1, 0.3, N_frames // 2)
        dz2 = np.linspace(0.3, 0.1, N_frames - N_frames // 2)
        dz = np.concatenate((dz1, dz2))

        theta_x1 = np.linspace(math.pi / 30, 0, N_frames // 2)
        theta_x2 = np.linspace(0, math.pi / 30, N_frames - N_frames // 2)
        theta_x = np.concatenate((theta_x1, theta_x2))

        theta_y = np.linspace(math.pi / 10, -math.pi / 10, N_frames)
        theta_z = np.linspace(0, 0, N_frames)

        # Replicate the base pose (poses_dict entry 1123, hard-coded) once per frame, then
        # offset column 3 (presumably the translation — confirm the pose layout) and
        # left-multiply the first three columns by the per-frame rotation.
        dataset.poses_test = np.tile(dataset.poses_dict[1123], (N_frames, 1, 1))
        for i in range(N_frames):
            dataset.poses_test[i, 0, 3] += dx[i]
            dataset.poses_test[i, 1, 3] += dy[i]
            dataset.poses_test[i, 2, 3] += dz[i]
            dataset.poses_test[i, :, :3] = np.dot(eulerAnglesToRotationMatrix([theta_x[i],theta_y[i],theta_z[i]]), dataset.poses_test[i, :, :3])
    
    elif scene == 'trevi_fountain':
        # Reference image for the appearance embedding: training image at position 1548
        # of img_ids_train, loaded as RGB and downscaled by a factor of 8.
        img = Image.open(os.path.join(args.root_dir, 'dense/images', dataset.image_paths[dataset.img_ids_train[1548]])).convert('RGB')
        img_downscale = 8
        img_w, img_h = img.size
        img_w = img_w // img_downscale
        img_h = img_h // img_downscale
        img = img.resize((img_w, img_h), Image.LANCZOS)
        
        # Convert to a tensor, normalize with (x - 0.5) / 0.5, add a batch dimension,
        # move to the GPU and encode.
        toTensor = T.ToTensor()
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        img = toTensor(img)
        whole_img = normalize(img).unsqueeze(0).cuda()
        kwargs['a_embedded_from_img'] = enc_a(whole_img)

        dataset.test_appearance_idx = dataset.img_ids_train[1548]
        N_frames = 30 * 8

        # Per-frame translation offsets and rotation angles of the camera path.
        dx = np.linspace(-0.8, 0.7, N_frames)
        dy1 = np.linspace(-0., 0.05, N_frames // 2)
        dy2 = np.linspace(0.05, -0., N_frames - N_frames // 2)
        dy = np.concatenate((dy1, dy2))

        # dz is built from four segments (three quarters plus the remainder).
        dz1 = np.linspace(0.4, 0.1, N_frames // 4)
        dz2 = np.linspace(0.1, 0.5, N_frames // 4)
        dz3 = np.linspace(0.5, 0.1, N_frames // 4)
        dz4 = np.linspace(0.1, 0.4, N_frames - 3 * (N_frames // 4))
        dz = np.concatenate((dz1, dz2, dz3, dz4))

        # theta_x is zero for every frame in this scene.
        theta_x1 = np.linspace(-0, 0, N_frames // 2)
        theta_x2 = np.linspace(0, -0, N_frames - N_frames // 2)
        theta_x = np.concatenate((theta_x1, theta_x2))

        theta_y = np.linspace(math.pi / 6, -math.pi / 6, N_frames)
        theta_z = np.linspace(0, 0, N_frames)

        # Replicate the reference image's pose once per frame, then offset column 3
        # (presumably the translation — confirm the pose layout) and left-multiply the
        # first three columns by the per-frame rotation.
        dataset.poses_test = np.tile(dataset.poses_dict[dataset.img_ids_train[1548]], (N_frames, 1, 1))
        for i in range(N_frames):
            dataset.poses_test[i, 0, 3] += dx[i]
            dataset.poses_test[i, 1, 3] += dy[i]
            dataset.poses_test[i, 2, 3] += dz[i]
            dataset.poses_test[i, :, :3] = np.dot(eulerAnglesToRotationMatrix([theta_x[i],theta_y[i],theta_z[i]]), dataset.poses_test[i, :, :3])

    elif scene == 'sacre_coeur':
        # Reference image for the appearance embedding: training image at position 58
        # of img_ids_train, loaded as RGB and downscaled by a factor of 8.
        img = Image.open(os.path.join(args.root_dir, 'dense/images', dataset.image_paths[dataset.img_ids_train[58]])).convert('RGB')
        img_downscale = 8
        img_w, img_h = img.size
        img_w = img_w // img_downscale
        img_h = img_h // img_downscale
        img = img.resize((img_w, img_h), Image.LANCZOS)
        
        # Convert to a tensor, normalize with (x - 0.5) / 0.5, add a batch dimension,
        # move to the GPU and encode.
        toTensor = T.ToTensor()
        normalize = T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        img = toTensor(img)
        whole_img = normalize(img).unsqueeze(0).cuda()
        kwargs['a_embedded_from_img'] = enc_a(whole_img)

        dataset.test_appearance_idx = dataset.img_ids_train[58]
        N_frames = 30 * 8

        # Per-frame translation offsets and rotation angles of the camera path.
        dx = np.linspace(-2, 2, N_frames)
        dy1 = np.linspace(-0., 2, N_frames // 2)
        dy2 = np.linspace(2, -0., N_frames - N_frames // 2)
        dy = np.concatenate((dy1, dy2))

        dz1 = np.linspace(0, -3, N_frames // 2)
        dz2 = np.linspace(-3, 0, N_frames - N_frames // 2)
        dz = np.concatenate((dz1, dz2))

        # NOTE(review): the statement that follows these two lines in this listing is cut off.
        theta_x1 = np.linspace(-0, 0, N_frames // 2)
        theta_x2 = np.linspace(0, -0, N_frames - N_frames // 2)
        theta_x = np.concatenate((

（6）渲染和保存图像

遍历数据集中的每个样本，使用NeRF模型进行渲染，并将渲染结果保存为图像文件；对于blender数据集或phototourism数据集的test划分，最后还会把所有帧合成为一个视频文件。

    # Iterate over the dataset:
    for i in tqdm(range(len(dataset))):
        # tqdm wraps the index range to show a progress bar over all samples.
        # Fetch the sample:
        sample = dataset[i]
        rays = sample['rays']
        ts = sample['ts']
        # rays and ts of the current sample.

        # Per-sample appearance encoding on the 'test_test' split:
        if args.split == 'test_test' and args.encode_a:
            whole_img = sample['whole_img'].unsqueeze(0).cuda()
            whole_img=(whole_img+1)/2
            kwargs['a_embedded_from_img'] = enc_a(whole_img)
        # When the split is 'test_test' and encode_a is set, the sample's own image is
        # rescaled with (x + 1) / 2 (presumably from [-1, 1] to [0, 1] — confirm against
        # the dataset) and encoded, replacing any previous 'a_embedded_from_img'.

        # Batched inference:
        # NOTE(review): ts.cuda() is called unconditionally, so ts must not be None here
        # even though batched_inference itself accepts ts=None.
        results = batched_inference(models, embeddings, rays.cuda(), ts.cuda(),
                                    args.N_samples, args.N_importance, args.use_disp,
                                    args.chunk,
                                    dataset.white_back,
                                    **kwargs)
        # results holds the renderer outputs, concatenated over all chunks.

        # Image size:
        if args.dataset_name == 'blender':
            w, h = args.img_wh
        else:
            w, h = sample['img_wh']
        # Width/height come from the options for blender, otherwise from the sample.

        # Feature map:
        feature=results['feature_fine'] # original author's example shape: torch.Size([699008, 4])
        print("using fine feature")
        lastdim=feature.size(-1)
        feature = rearrange(feature, 'n1 n3 -> n3 n1', n3=lastdim)
        feature = rearrange(feature, ' n3 (h w) ->  1 n3 h w',  h=int(h), w=int(w),n3=lastdim)  # original author's example shape: torch.Size([1, 64, 340, 514])

        # The per-ray features (rays, channels) are transposed to (channels, rays) and
        # then unflattened to (1, channels, h, w) for the decoder.

        # Decode the feature map into RGB:
        # NOTE(review): kwargs['a_embedded_from_img'] must have been set earlier; the
        # visible code only sets it on encode_a / scene-specific paths.
        rgbs_pred=models['decoder'](feature, kwargs['a_embedded_from_img'])
        rgbs_pred=rearrange(rgbs_pred, ' 1 n1 h w ->  (h w) n1',  h=int(h), w=int(w),n1=3)
        results['rgb_fine']=rgbs_pred.cpu()
        
        # Save the rendered image: clip to [0, 1], scale to uint8, keep it for the video
        # and write it as a zero-padded PNG into dir_name.
        img_pred = np.clip(results['rgb_fine'].view(h, w, 3).detach().numpy(), 0, 1)
        img_pred_ = (img_pred*255).astype(np.uint8)
        imgs += [img_pred_]
        imageio.imwrite(os.path.join(dir_name, f'{i:03d}.png'), img_pred_)
        print("image saving path",os.path.join(dir_name, f'{i:03d}.png'))

    # After the loop every frame has been written as a PNG and collected in imgs.
    # For blender, or phototourism on the 'test' split, the frames are additionally
    # written as a 30 fps video named <scene_name>.<video_format>.

    if args.dataset_name == 'blender' or \
      (args.dataset_name == 'phototourism' and args.split == 'test'):
        imageio.mimsave(os.path.join(dir_name, f'{args.scene_name}.{args.video_format}'),
                        imgs, fps=30)
    print('Done')