文章目录
- 最近邻插值(加速方法)
- (1)scipy.ndimage.zoom
- (2)Numba-CPU加速
- (3)Numba-GPU加速
- (4)Numba-CPU加速(Z轴切块)
- (5)Numba-CPU加速(XYZ轴切块)
- (6)Numba-CPU加速(XYZ轴切块)+ 多线程
输入数据 | 插值倍数 | 时耗 | |
---|---|---|---|
scipy.ndimage.zoom | (1024, 1024, 512) | 4 | 172.16s |
Numba-CPU | (1024, 1024, 512) | 4 | 56.58s |
Numba-GPU | (1024, 1024, 512) | 4 | 122.51s |
Numba-CPU(Z轴切块) | (1024, 1024, 512) | 4 | 52.44 |
Numba-CPU(XYZ轴切块) | (1024, 1024, 512) | 4 | 72.69s |
Numba-CPU(XYZ轴切块)+ 多线程 | (1024, 1024, 512) | 4 | 50.20s |
为什么使用 GPU 反而更慢了:
- (1)数据传输开销:GPU 的计算速度快,但数据在 CPU 和 GPU 之间传输时会有较大的开销。
- (2)任务并行性不高:GPU 适合大规模并行计算,如果任务的并行性不够高,比如 Z 轴切块后的任务处理,GPU 的潜力可能无法得到充分发挥。相比之下,CPU 在处理较小规模任务时可能表现得更有效率。
- (3)专用 GPU 内存不足,自动转共享 GPU 内存(时耗增加)。
最近邻插值(加速方法)
(1)scipy.ndimage.zoom
from scipy.ndimage import zoom
import time
import numpy as np
if __name__ == "__main__":
# (1)创建数组
input_data = np.zeros((1024, 1024, int(1024 * 0.5))).astype(bool)
input_data[50:600, 200:1000, 5:30] = 1
# (2)插值计算
start_time = time.time()
#############################################################
zoom_factor = [2, 2, 2] # 指定插值因子
output_data = zoom(input_data, zoom_factor, order=0)
#############################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
print("获取非零元素的数量(输出):", np.count_nonzero(output_data))
print("原始数据:", input_data.shape)
print("插值结果:", output_data.shape)
"""
##### 插值两倍 #####
计算时间: 21.160449504852295
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 88000000
原始数据: (1024, 1024, 512)
插值结果: (2048, 2048, 1024)
##### 插值四倍 #####
计算时间: 172.16581082344055
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 704760200
原始数据: (1024, 1024, 512)
插值结果: (4096, 4096, 2048)
"""
(2)Numba-CPU加速
import numba
import numpy as np
@numba.jit()
def nearest_neighbor_interpolation(input_data, scale_factors, output_data):
"""最近邻插值算法
输入参数:3D图像 + 插值因子 + 预分配的输出数据
"""
# 对目标数组中的每个点进行插值
for z in range(output_data.shape[0]):
zz = min(round(z / scale_factors[0]), input_data.shape[0] - 1) # round四舍五入可能会超出数据边界的值
for y in range(output_data.shape[1]):
yy = min(round(y / scale_factors[1]), input_data.shape[1] - 1) # round四舍五入可能会超出数据边界的值
for x in range(output_data.shape[2]):
# 计算在原始数据中的对应位置,并限制在原始数据范围内
xx = min(round(x / scale_factors[2]), input_data.shape[2] - 1) # round四舍五入可能会超出数据边界的值
output_data[z, y, x] = input_data[zz, yy, xx] # 最近邻插值
return output_data
if __name__ == "__main__":
# (1)创建数组
input_data = np.zeros((1024, 1024, int(1024*0.5)), dtype=bool) # 创建3D数组
input_data[50:600, 200:1000, 5:30] = 1
# (2)插值计算
import time
start_time = time.time()
#############################################################
# 计算目标形状并创建目标数组
output_shape = np.round(np.array(input_data.shape) * np.array(scale_factors)).astype(int)
output_data = np.zeros(output_shape, dtype=input_data.dtype)
scale_factors = [4, 4, 4] # 指定插值因子
nearest_neighbor_interpolation(input_data, scale_factors, output_data) # 执行3D最近邻插值
#############################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
print("获取非零元素的数量(输出):", np.count_nonzero(output_data))
print("原始数据:", input_data.shape)
print("插值结果:", output_data.shape)
"""
##### 插值两倍 #####
计算时间: 7.694857835769653
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 86240000
原始数据: (1024, 1024, 512)
插值结果: (2048, 2048, 1024)
##### 插值四倍 #####
计算时间: 56.58441758155823
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 696960000
原始数据: (1024, 1024, 512)
插值结果: (4096, 4096, 2048)
"""
(3)Numba-GPU加速
import numpy as np
from numba import cuda
@cuda.jit
def nearest_neighbor_interpolation_gpu(input_data, output_data, scale_factors):
"""
最近邻插值的CUDA版本
"""
z, y, x = cuda.grid(3) # 获取3D网格中线程的位置 (z, y, x)
if z < output_data.shape[0] and y < output_data.shape[1] and x < output_data.shape[2]:
zz = min(round(z / scale_factors[0]), input_data.shape[0] - 1)
yy = min(round(y / scale_factors[1]), input_data.shape[1] - 1)
xx = min(round(x / scale_factors[2]), input_data.shape[2] - 1)
output_data[z, y, x] = input_data[zz, yy, xx]
def gpu_nearest_neighbor_interpolation(input_data, scale_factors):
# 创建目标数组的形状
output_shape = np.round(input_data.shape * scale_factors).astype(int)
output_data = np.zeros(output_shape, dtype=input_data.dtype)
# 将输入数据从CPU上传到GPU
input_data_gpu = cuda.to_device(input_data)
output_data_gpu = cuda.to_device(output_data)
#######################################################################
# 定义线程和块的大小
threads_per_block = (16, 16, 4)
blocks_per_grid = ((output_shape[0] + threads_per_block[0] - 1) // threads_per_block[0],
(output_shape[1] + threads_per_block[1] - 1) // threads_per_block[1],
(output_shape[2] + threads_per_block[2] - 1) // threads_per_block[2])
# 执行CUDA核函数
nearest_neighbor_interpolation_gpu[blocks_per_grid, threads_per_block](input_data_gpu, output_data_gpu, scale_factors)
#######################################################################
# 将结果从GPU下载回CPU
output_data = output_data_gpu.copy_to_host()
return output_data
if __name__ == "__main__":
# (1)创建3D数组
input_data = np.zeros((1024, 1024, int(1024 * 0.5))).astype(bool)
input_data[50:600, 200:1000, 5:30] = 1
# (2)执行插值
import time
start_time = time.time()
#######################################################################
scale_factors = np.array([4, 4, 4]) # 指定插值因子
output_data = gpu_nearest_neighbor_interpolation(input_data, scale_factors)
#######################################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
print("获取非零元素的数量(输出):", np.count_nonzero(output_data))
print("原始数据:", input_data.shape)
print("插值结果:", output_data.shape)
"""
##### 插值两倍 #####
计算时间: 3.6601688861846924
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 86240000
原始数据: (1024, 1024, 512)
插值结果: (2048, 2048, 1024)
##### 插值四倍 #####
计算时间: 122.51327633857727
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 696960000
原始数据: (1024, 1024, 512)
插值结果: (4096, 4096, 2048)
"""
(4)Numba-CPU加速(Z轴切块)
import numba
import numpy as np
@numba.jit()
def nearest_neighbor_interpolation(input_data, scale_factors, output_block):
# 对目标数组中的每个点进行插值
for z in range(output_block.shape[0]):
zz = min(round(z / scale_factors[0]), input_data.shape[0] - 1)
for y in range(output_block.shape[1]):
yy = min(round(y / scale_factors[1]), input_data.shape[1] - 1)
for x in range(output_block.shape[2]):
# 计算在原始数据中的对应位置,并限制在原始数据范围内
xx = min(round(x / scale_factors[2]), input_data.shape[2] - 1)
output_block[z, y, x] = input_data[zz, yy, xx] # 最近邻插值
return output_block
def segment_blocks(input_data, block_size, scale_factors):
num_blocks = int(np.ceil(input_data.shape[0] / block_size))
print("切块数量 =", num_blocks)
zoomed_blocks = []
for i in range(num_blocks):
start_idx = i * block_size
end_idx = min((i + 1) * block_size, input_data.shape[0])
block = input_data[start_idx:end_idx, :, :]
##########################################################################
output_shape = np.round(np.array(block.shape) * scale_factors).astype(int) # 计算目标形状
output_block = np.zeros(output_shape, dtype=block.dtype) # 创建目标形状的数组
zoomed_block = nearest_neighbor_interpolation(block, scale_factors, output_block)
# zoomed_block = scipy.ndimage.zoom(block, zoom_factor, order=1)
##########################################################################
zoomed_blocks.append(zoomed_block)
output_data = np.concatenate(zoomed_blocks, axis=0) # 沿着第一个轴进行拼接得到合并结果
return output_data
if __name__ == "__main__":
input_data = np.zeros((1024, 1024, int(1024 * 0.5))).astype(bool) # 创建一个示例的3D数组
input_data[50:600, 200:1000, 5:30] = 1
import time
start_time = time.time()
###############################################################
scale_factors = np.array([4, 4, 4]) # 指定插值因子
block_size = 100 # 切割小块的Z轴尺度,会自动分配并处理越界问题(切块数量 = input_data[0] / block_size)
output_data = segment_blocks(input_data, block_size, scale_factors) # 分块插值
##########################################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
print("获取非零元素的数量(输出):", np.count_nonzero(output_data))
print("原始数据:", input_data.shape)
print("插值结果:", output_data.shape)
"""
##### 插值两倍 #####
切块数量 = 11
总共耗时: 6.803957223892212
Non-zero Count: 11000000
Non-zero Count: 86318400
原始数据: (1024, 1024, 512)
插值结果: (2048, 2048, 1024)
##### 插值四倍 #####
切块数量 = 11
计算时间: 52.44191575050354
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 697593600
原始数据: (1024, 1024, 512)
插值结果: (4096, 4096, 2048)
"""
(5)Numba-CPU加速(XYZ轴切块)
import numba
import numpy as np
@numba.jit()
def nearest_neighbor_interpolation(input_data, scale_factors, output_block):
for z in range(output_block.shape[0]):
zz = min(round(z / scale_factors[0]), input_data.shape[0] - 1)
for y in range(output_block.shape[1]):
yy = min(round(y / scale_factors[1]), input_data.shape[1] - 1)
for x in range(output_block.shape[2]):
xx = min(round(x / scale_factors[2]), input_data.shape[2] - 1)
output_block[z, y, x] = input_data[zz, yy, xx]
return output_block
def segment_blocks(input_data, block_sizes, scale_factors):
num_blocks_z = int(np.ceil(input_data.shape[0] / block_sizes[0]))
num_blocks_y = int(np.ceil(input_data.shape[1] / block_sizes[1]))
num_blocks_x = int(np.ceil(input_data.shape[2] / block_sizes[2]))
zoomed_blocks_zz = []
for z in range(num_blocks_z):
start_z = z * block_sizes[0]
end_z = min((z + 1) * block_sizes[0], input_data.shape[0])
zoomed_blocks_yy = []
for y in range(num_blocks_y):
start_y = y * block_sizes[1]
end_y = min((y + 1) * block_sizes[1], input_data.shape[1])
zoomed_blocks_xx = []
for x in range(num_blocks_x):
start_x = x * block_sizes[2]
end_x = min((x + 1) * block_sizes[2], input_data.shape[2])
block = input_data[start_z:end_z, start_y:end_y, start_x:end_x]
print(f"{num_blocks_z, num_blocks_y, num_blocks_x}/{z + 1, y + 1, x + 1}", "block.shape =", block.shape)
##########################################################################
output_shape = np.round(block.shape * scale_factors).astype(int)
output_block = np.zeros(output_shape, dtype=block.dtype)
zoomed_block_x = nearest_neighbor_interpolation(block, scale_factors, output_block)
##########################################################################
zoomed_blocks_xx.append(zoomed_block_x)
zoomed_block_y = np.concatenate(zoomed_blocks_xx, axis=2) # 沿着X轴拼接
zoomed_blocks_yy.append(zoomed_block_y)
zoomed_block_z = np.concatenate(zoomed_blocks_yy, axis=1) # 沿着Y轴拼接
zoomed_blocks_zz.append(zoomed_block_z)
zoomed_block = np.concatenate(zoomed_blocks_zz, axis=0) # 沿着Z轴拼接
print("切块数量 =", num_blocks_z * num_blocks_y * num_blocks_x)
return zoomed_block
if __name__ == "__main__":
input_data = np.zeros((1024, 1024, int(1024 * 0.5))).astype(bool)
input_data[50:600, 200:1000, 5:30] = 1
import time
start_time = time.time()
##########################################################################
scale_factors = np.array([4, 4, 4])
block_sizes = (100, 100, 100) # 小块的尺度,沿着z轴、y轴、x轴
zoomed_block = segment_blocks(input_data, block_sizes, scale_factors)
##########################################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
print("获取非零元素的数量(输出):", np.count_nonzero(zoomed_block))
print("原始数据:", input_data.shape)
print("插值结果:", zoomed_block.shape)
"""
##### 插值两倍 #####
切块数量 = 726
计算时间: 9.834398746490479
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 86318400
原始数据: (1024, 1024, 512)
插值结果: (2048, 2048, 1024)
##### 插值四倍 #####
切块数量 = 726
计算时间: 72.6902551651001
获取非零元素的数量(输入): 11000000
获取非零元素的数量(输出): 697593600
原始数据: (1024, 1024, 512)
插值结果: (4096, 4096, 2048)
"""
(6)Numba-CPU加速(XYZ轴切块)+ 多线程
import numba
import numpy as np
import concurrent.futures
@numba.jit()
def nearest_neighbor_interpolation(input_data, scale_factors, output_block):
for z in range(output_block.shape[0]):
zz = min(round(z / scale_factors[0]), input_data.shape[0] - 1)
for y in range(output_block.shape[1]):
yy = min(round(y / scale_factors[1]), input_data.shape[1] - 1)
for x in range(output_block.shape[2]):
xx = min(round(x / scale_factors[2]), input_data.shape[2] - 1)
output_block[z, y, x] = input_data[zz, yy, xx]
return output_block
def process_block(block, scale_factors):
output_shape = np.round(block.shape * scale_factors).astype(int)
output_block = np.zeros(output_shape, dtype=block.dtype)
zoomed_block = nearest_neighbor_interpolation(block, scale_factors, output_block)
return zoomed_block
def segment_blocks(input_data, block_sizes, scale_factors):
num_blocks_z = int(np.ceil(input_data.shape[0] / block_sizes[0]))
num_blocks_y = int(np.ceil(input_data.shape[1] / block_sizes[1]))
num_blocks_x = int(np.ceil(input_data.shape[2] / block_sizes[2]))
zoomed_blocks = []
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
for z in range(num_blocks_z):
start_z = z * block_sizes[0]
end_z = min((z + 1) * block_sizes[0], input_data.shape[0])
for y in range(num_blocks_y):
start_y = y * block_sizes[1]
end_y = min((y + 1) * block_sizes[1], input_data.shape[1])
for x in range(num_blocks_x):
start_x = x * block_sizes[2]
end_x = min((x + 1) * block_sizes[2], input_data.shape[2])
##########################################################################
block = input_data[start_z:end_z, start_y:end_y, start_x:end_x]
futures.append(executor.submit(process_block, block, scale_factors))
##########################################################################
# for future in concurrent.futures.as_completed(futures):
# zoomed_blocks.append(future.result())
print("切块数量 =", num_blocks_z * num_blocks_y * num_blocks_x)
return zoomed_blocks
if __name__ == "__main__":
input_data = np.zeros((1024, 1024, int(1024 * 0.5))).astype(bool)
input_data[50:600, 200:1000, 5:30] = 1
import time
start_time = time.time()
##########################################################################
scale_factors = np.array([4, 4, 4])
block_sizes = (100, 100, 100) # 小块的尺度,沿着z轴、y轴、x轴
output_data = segment_blocks(input_data, block_sizes, scale_factors)
##########################################################################
print("计算时间:", time.time() - start_time)
print("获取非零元素的数量(输入):", np.count_nonzero(input_data))
# print("获取非零元素的数量(输出):", np.count_nonzero(output_data))
print("原始数据:", input_data.shape)
# print("插值结果:", output_data.shape)
"""
##### 插值两倍 #####
切块数量 = 726
计算时间: 6.778625011444092
获取非零元素的数量(输入): 11000000
原始数据: (1024, 1024, 512)
##### 插值四倍 #####
切块数量 = 726
计算时间: 50.20111918449402
获取非零元素的数量(输入): 11000000
原始数据: (1024, 1024, 512)
"""