支持超高分辨率图片生成，UltraPixel模型分享

news2025/7/17 13:24:19

UltraPixel是一种由华为诺亚方舟实验室联合香港科技大学共同开发的超高清图像合成架构，旨在生成具有丰富细节的高质量图像，其分辨率可以从1K一直延伸至6K。

UltraPixel不仅仅是一个图像放大工具，它还能在生成过程中优化细节，提升整体图像的质量。

UltraPixel利用级联扩散模型，通过低分辨率图像的语义丰富表示来指导高分辨率图像的生成，显著降低了生成复杂性。

此外，UltraPixel还引入了隐式神经表示（INR）进行连续上采样，以及适应不同分辨率的尺度感知归一化层，确保了在生成不同分辨率图像时的一致性和高质量输出。

在低分辨率和高分辨率处理过程中，UltraPixel在最紧凑的空间内进行操作，绝大多数参数是共享的，高分辨率输出仅增加了不到3%的额外参数，大大提高了训练和推理的效率。

GitHub项目地址：https://github.com/catcathh/UltraPixel

一、环境安装

1、python环境

建议安装python版本在3.10以上。

2、pip库安装

pip install torch==2.1.2+cu118 torchvision==0.16.2+cu118 torchaudio==2.1.2 --extra-index-url https://download.pytorch.org/whl/cu118

pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

3、UltraPixel模型下载：

git lfs install

git clone https://huggingface.co/roubaofeipi/UltraPixel

4、StableWurst模型下载：

git lfs install

git clone https://huggingface.co/stabilityai/StableWurst

5、CLIP-ViT-bigG-14-laion2B-39B-b160k模型下载：

git lfs install

git clone https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k

6、clip-vit-large-patch14模型下载：

git lfs install

git clone https://huggingface.co/openai/clip-vit-large-patch14

二、功能测试

1、运行测试：

（1）文本引导（Text-guided）的Python代码调用测试

import os
import sys
import yaml
import torch
import random
import numpy as np
import argparse
from tqdm import tqdm
from einops import rearrange

from inference.utils import *
from core.utils import load_or_fail
from train import WurstCoreB, WurstCore_t2i as WurstCoreC
from gdf import VPScaler, CosineTNoiseCond, DDPMSampler, P2LossWeight, AdaptiveLossWeight

def parse_args():
    """Parse the command-line options of the text-to-image demo.

    The options are declared in a table (flag, ``add_argument`` keyword
    arguments) and registered in order, so the ``--help`` output lists them
    in the same sequence as the table.

    Returns:
        argparse.Namespace: the parsed options, read from ``sys.argv``.
    """
    option_table = (
        ('--height', dict(type=int, default=2560, help='image height')),
        ('--width', dict(type=int, default=5120, help='image width')),
        ('--seed', dict(type=int, default=123, help='random seed')),
        ('--dtype', dict(type=str, default='bf16',
                         help='datatype, if bf16 does not work, change to float32')),
        ('--config_c', dict(type=str, default='configs/training/t2i.yaml',
                            help='config file for stage C, latent generation')),
        ('--config_b', dict(type=str, default='configs/inference/stage_b_1b.yaml',
                            help='config file for stage B, latent decoding')),
        ('--prompt', dict(type=str,
                          default='A photo-realistic image of a west highland white terrier in the garden, high quality, detail rich',
                          help='text prompt')),
        ('--num_image', dict(type=int, default=2, help='number of generated images')),
        ('--output_dir', dict(type=str, default='figures/output_results/',
                              help='output directory for generated images')),
        # Boolean switch: absent -> False, present -> True.
        ('--stage_a_tiled', dict(action='store_true',
                                 help='whether or not to use tiled decoding for stage A to save memory')),
        ('--pretrained_path', dict(type=str, default='UltraPixel/ultrapixel_t2i.safetensors',
                                   help='pretrained path of newly added parameter of UltraPixel')),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()

def setup_model(config_file, core_class, device, training):
    """Instantiate a pipeline core from a YAML config and set up its extras and models.

    Args:
        config_file: path to a UTF-8 YAML file; parsed with ``yaml.safe_load``.
        core_class: class to instantiate; it is called with the keyword
            arguments ``config_dict``, ``device`` and ``training``.
        device: forwarded unchanged to ``core_class``.
        training: forwarded unchanged to ``core_class``.

    Returns:
        tuple: ``(core, extras, models)`` where ``extras`` is the result of
        ``core.setup_extras_pre()`` and ``models`` is the result of
        ``core.setup_models(extras)``.
    """
    with open(config_file, "r", encoding="utf-8") as file:
        loaded_config = yaml.safe_load(file)
    core = core_class(config_dict=loaded_config, device=device, training=training)
    # Build the extras exactly once: the extras handed back to the caller are
    # then the very object the models were set up with, and the setup work is
    # not duplicated.
    extras = core.setup_extras_pre()
    models = core.setup_models(extras)
    return core, extras, models

def main():
    """Run the text-guided demo: Stage C latent generation, then Stage B + A decoding.

    Reads the options from ``parse_args()``, seeds the torch / random / numpy
    RNGs, builds the Stage C and Stage B cores, loads the extra pretrained
    weights into ``models.train_norm``, and then generates ``--num_image``
    images for the prompt one at a time, saving each as a JPEG in
    ``--output_dir``.
    """
    args = parse_args()
    print(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    # Any value other than 'bf16' selects float32.
    dtype = torch.bfloat16 if args.dtype == 'bf16' else torch.float

    # Setup Stage C
    core, extras, models = setup_model(args.config_c, WurstCoreC, device, training=False)
    models.generator.eval().requires_grad_(False)
    print("STAGE C READY")

    # Setup Stage B (re-uses the tokenizer and text model already loaded for Stage C)
    core_b, extras_b, models_b = setup_model(args.config_b, WurstCoreB, device, training=False)
    models_b = WurstCoreB.Models(**{**models_b.to_dict(), 'tokenizer': models.tokenizer, 'text_model': models.text_model})
    # Cast to the dtype chosen via --dtype; this used to be hard-coded to
    # bfloat16, which ignored the documented float32 fallback.
    models_b.generator.to(dtype).eval().requires_grad_(False)
    print("STAGE B READY")

    captions = [args.prompt] * args.num_image
    height, width = args.height, args.width
    save_dir = args.output_dir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    # The first 20 prompt characters become part of the file name; replace
    # path separators so a prompt containing one cannot point outside save_dir
    # or make the save fail after generation has already run.
    name_prefix = args.prompt[:20]
    for sep in filter(None, (os.sep, os.altsep)):
        name_prefix = name_prefix.replace(sep, '_')

    # Load Pretrained Model
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source. The default path ends in ".safetensors" but is
    # read with torch.load -- presumably a pickled checkpoint; confirm.
    pretrained_path = args.pretrained_path
    sdd = torch.load(pretrained_path, map_location='cpu')
    # Drops the first 7 characters of every key -- presumably a "module."
    # prefix; TODO confirm against the checkpoint.
    collect_sd = {k[7:]: v for k, v in sdd.items()}
    models.train_norm.load_state_dict(collect_sd)
    models.generator.eval()
    models.train_norm.eval()

    batch_size = 1
    height_lr, width_lr = get_target_lr_size(height / width, std_size=32)
    stage_c_latent_shape, stage_b_latent_shape = calculate_latent_sizes(height, width, batch_size=batch_size)
    # Only the Stage C shape of the low-resolution pass is used below.
    stage_c_latent_shape_lr, _ = calculate_latent_sizes(height_lr, width_lr, batch_size=batch_size)

    # Stage C Parameters
    extras.sampling_configs.update({
        'cfg': 4,
        'shift': 1,
        'timesteps': 20,
        't_start': 1.0,
        'sampler': DDPMSampler(extras.gdf)
    })

    # Stage B Parameters
    extras_b.sampling_configs.update({
        'cfg': 1.1,
        'shift': 1,
        'timesteps': 10,
        't_start': 1.0
    })

    for cnt, caption in enumerate(captions):
        batch = {'captions': [caption] * batch_size}

        # NOTE(review): these two results are not used later in this function;
        # the calls are kept because get_conditions' side effects are not
        # visible from here.
        conditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=False, eval_image_embeds=False)
        unconditions = core.get_conditions(batch, models, extras, is_eval=True, is_unconditional=True, eval_image_embeds=False)

        with torch.no_grad():
            # Follow the selected device instead of assuming CUDA.
            models.generator.to(device)
            print('STAGE C GENERATION***************************')
            with torch.autocast(device_type=device.type, dtype=dtype):
                sampled_c = generation_c(batch, models, extras, core, stage_c_latent_shape, stage_c_latent_shape_lr, device)

            # Move the Stage C generator off the accelerator before Stage B runs.
            models.generator.cpu()
            torch.cuda.empty_cache()

            conditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=False)
            unconditions_b = core_b.get_conditions(batch, models_b, extras_b, is_eval=True, is_unconditional=True)
            # Stage B is conditioned on the Stage C sample; the unconditional
            # branch gets zeros of the same shape.
            conditions_b['effnet'] = sampled_c
            unconditions_b['effnet'] = torch.zeros_like(sampled_c)

            print('STAGE B + A DECODING***************************')
            with torch.autocast(device_type=device.type, dtype=dtype):
                sampled = decode_b(conditions_b, unconditions_b, models_b, stage_b_latent_shape, extras_b, device, stage_a_tiled=args.stage_a_tiled)

            torch.cuda.empty_cache()
            imgs = show_images(sampled)
            for idx, img in enumerate(imgs):
                # The name depends only on cnt; with batch_size == 1 there is
                # one image per iteration, so nothing is overwritten.
                img_path = os.path.join(save_dir, f"{name_prefix}_{cnt:05}.jpg")
                print(img_path, idx)
                img.save(img_path)

    print(f'Finished! Results at {save_dir}')

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

未完......

更多详细内容，欢迎关注：杰哥新技术