1.行列分解数学原理
x方向梯度（Gx）：row_filter=[1 0 -1], col_filter=[1 2 1]
y方向梯度（Gy）：row_filter=[1 2 1], col_filter=[1 0 -1]
2.非局部内存实现（输入像素直接从全局内存读取，仅行滤波的中间结果存放在局部内存中）
/*
 * Separable 3x3 Sobel filter that reads its input directly from global memory.
 *
 * Pass 1 (row-wise) applies [1 0 -1] (x gradient) and [1 2 1] (y gradient)
 * along each image row and stores the intermediates in local memory.
 * Pass 2 (col-wise) applies [1 2 1] / [1 0 -1] down the columns of those
 * intermediates. The output pixel is |Gx| + |Gy| saturated to 0..255.
 *
 * padSrc    : padded source image; the 3x3 window of output pixel (row, col)
 *             starts at padSrc[row * pad_width + col].
 * dst       : output image, row stride `width`.
 * height    : number of output rows.
 * width     : number of output columns.
 * pad_width : row stride of padSrc, in pixels.
 *
 * NOTE(review): source reads are clamped on the assumption that padSrc holds
 * at least (height + FILTERSIZE - 1) rows with at least
 * (width + FILTERSIZE - 1) readable pixels each -- confirm against host code.
 * NOTE(review): presumes the work-group size is LOCAL_XRES x LOCAL_YRES, that
 * LOCAL_YRES >= FILTERSIZE - 1, and that FILTERSIZE == 3 (the +1/+2 taps are
 * hard-coded).
 */
__kernel void sobel_filter_separable(__global uchar* padSrc, __global uchar* dst, int height, int width, int pad_width)
{
    __local short local_output_x[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
    __local short local_output_y[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];

    // Uniform for the whole NDRange, so returning here cannot split a work-group
    // around the barrier below.
    if (width <= 0 || height <= 0) return;

    int col = (int)get_global_id(0);
    int row = (int)get_global_id(1);
    int lid_x = get_local_id(0);
    int lid_y = get_local_id(1);

    // Work-items outside the image must NOT return early: every work-item of the
    // group has to reach the barrier, and in-range neighbours consume the local
    // rows these work-items produce. Their reads are clamped instead.
    int last_row = height + FILTERSIZE - 2;
    int src_col = min(col, width - 1);
    int src = min(row, last_row) * pad_width + src_col;
    int out = lid_y * LOCAL_XRES + lid_x;

    /* row-wise */
    // row_filterx = [1 0 -1]
    local_output_x[out] = padSrc[src] - padSrc[src + 2];
    // row_filtery = [1 2 1]
    local_output_y[out] = padSrc[src] + padSrc[src + 1] * 2 + padSrc[src + 2];

    // The first FILTERSIZE - 1 rows of the group also produce the halo rows
    // located LOCAL_YRES rows further down.
    if (lid_y < FILTERSIZE - 1) {
        int halo_src = min(row + LOCAL_YRES, last_row) * pad_width + src_col;
        int halo_out = out + LOCAL_YRES * LOCAL_XRES;
        local_output_x[halo_out] = padSrc[halo_src] - padSrc[halo_src + 2];
        local_output_y[halo_out] = padSrc[halo_src] + padSrc[halo_src + 1] * 2 + padSrc[halo_src + 2];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* col-wise */
    if (col < width && row < height) {
        // col_filterx = [1 2 1]
        int gx = local_output_x[out] + local_output_x[out + LOCAL_XRES] * 2 + local_output_x[out + 2 * LOCAL_XRES];
        // col_filtery = [1 0 -1]
        int gy = local_output_y[out] - local_output_y[out + 2 * LOCAL_XRES];
        // Saturate while converting: a plain convert_uchar would wrap sums above
        // 255 before any clamp could see them.
        dst[row * width + col] = convert_uchar_sat(abs(gx) + abs(gy));
    }
}
3.局部内存实现
/*
 * Separable 3x3 Sobel filter that first stages the input tile in local memory.
 *
 * Step 1: the work-group cooperatively copies its
 *         (LOCAL_XRES + FILTERSIZE - 1) x (LOCAL_YRES + FILTERSIZE - 1) input
 *         tile from padSrc into local_input.
 * Step 2 (row-wise): [1 0 -1] (x gradient) and [1 2 1] (y gradient) are applied
 *         along each tile row; results go to local_output_x / local_output_y.
 * Step 3 (col-wise): [1 2 1] / [1 0 -1] are applied down the columns of those
 *         intermediates. The output pixel is |Gx| + |Gy| saturated to 0..255.
 *
 * padSrc    : padded source image; the 3x3 window of output pixel (row, col)
 *             starts at padSrc[row * pad_width + col].
 * dst       : output image, row stride `width`.
 * height    : number of output rows.
 * width     : number of output columns.
 * pad_width : row stride of padSrc, in pixels.
 *
 * NOTE(review): source reads are clamped on the assumption that padSrc holds
 * at least (height + FILTERSIZE - 1) rows with at least
 * (width + FILTERSIZE - 1) readable pixels each -- confirm against host code.
 * NOTE(review): presumes the work-group size is LOCAL_XRES x LOCAL_YRES, that
 * LOCAL_YRES >= FILTERSIZE - 1, and that FILTERSIZE == 3 (the +1/+2 taps are
 * hard-coded).
 */
__kernel void sobel_filter_separable_lds(__global uchar* padSrc, __global uchar* dst, int height, int width, int pad_width)
{
    __local short local_output_x[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
    __local short local_output_y[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
    __local uchar local_input[(LOCAL_XRES + FILTERSIZE - 1) * (LOCAL_YRES + FILTERSIZE - 1)];

    // Uniform for the whole NDRange, so returning here cannot split a work-group
    // around the barriers below.
    if (width <= 0 || height <= 0) return;

    int col = (int)get_global_id(0);
    int row = (int)get_global_id(1);
    int lid_x = get_local_id(0);
    int lid_y = get_local_id(1);
    int tile_xres = (LOCAL_XRES + FILTERSIZE - 1);
    int tile_yres = (LOCAL_YRES + FILTERSIZE - 1);
    int start_col = get_group_id(0) * LOCAL_XRES;
    int start_row = get_group_id(1) * LOCAL_YRES;
    int last_col = width + FILTERSIZE - 2;
    int last_row = height + FILTERSIZE - 2;

    // Cooperative tile load. Work-items outside the image must take part too:
    // they own slots of the tile that in-range neighbours read, and every
    // work-item has to reach the barriers. Edge reads are clamped instead.
    for (int t = lid_y * LOCAL_XRES + lid_x; t < tile_xres * tile_yres; t += LOCAL_XRES * LOCAL_YRES) {
        int gy = t / tile_xres;
        int gx = t - gy * tile_xres;
        int src_row = min(start_row + gy, last_row);
        int src_col = min(start_col + gx, last_col);
        local_input[t] = padSrc[src_row * pad_width + src_col];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    int in = lid_y * tile_xres + lid_x;
    int out = lid_y * LOCAL_XRES + lid_x;

    /* row-wise */
    // row_filterx = [1 0 -1]
    local_output_x[out] = local_input[in] - local_input[in + 2];
    // row_filtery = [1 2 1]
    local_output_y[out] = local_input[in] + local_input[in + 1] * 2 + local_input[in + 2];

    // The first FILTERSIZE - 1 rows of the group also filter the halo rows
    // located LOCAL_YRES rows further down the tile.
    if (lid_y < FILTERSIZE - 1) {
        int halo_in = in + LOCAL_YRES * tile_xres;
        int halo_out = out + LOCAL_YRES * LOCAL_XRES;
        local_output_x[halo_out] = local_input[halo_in] - local_input[halo_in + 2];
        local_output_y[halo_out] = local_input[halo_in] + local_input[halo_in + 1] * 2 + local_input[halo_in + 2];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* col-wise */
    if (col < width && row < height) {
        // col_filterx = [1 2 1]
        int gx = local_output_x[out] + local_output_x[out + LOCAL_XRES] * 2 + local_output_x[out + 2 * LOCAL_XRES];
        // col_filtery = [1 0 -1]
        int gy = local_output_y[out] - local_output_y[out + 2 * LOCAL_XRES];
        // Saturate while converting: a plain convert_uchar would wrap sums above
        // 255 before any clamp could see them.
        dst[row * width + col] = convert_uchar_sat(abs(gx) + abs(gy));
    }
}
输入数据局部内存初始化原理参考下图