简介

要使用仿射变换完成图像等宽高比缩放，可以按照以下步骤进行操作：

定义仿射变换矩阵：首先，定义一个仿射变换矩阵，用于描述缩放操作。该矩阵是一个2x3的矩阵，包含平移、缩放和旋转的参数。对于缩放而言，我们只需要调整矩阵中的缩放参数。

其中，scale_x 和 scale_y 分别表示在 x 和 y 方向上的缩放因子。
构建仿射变换矩阵：使用上述定义的缩放因子，将其填充到仿射变换矩阵的相应位置。
应用仿射变换：对于每个图像像素，根据仿射变换矩阵的逆矩阵将其映射到原始图像中的位置。然后，使用插值方法计算映射后位置的像素值。

常用的插值方法包括最近邻插值、双线性插值和双三次插值。根据具体需求和性能要求，选择合适的插值方法。
创建新图像：根据变换后的坐标和插值结果，生成新的缩放后的图像。

本文使用的插值方法是双线性插值。

宿主机程序

#include <iostream>
#include <fstream>
#include <sstream>
#include <string.h>
#include <algorithm>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include <opencv2/opencv.hpp>
struct AffineMatrix{
    float i2d[6];   
    float d2i[6];  
    void invertAffineTransform(float imat[6], float omat[6]){
        float i00 = imat[0];  float i01 = imat[1];  float i02 = imat[2];
        float i10 = imat[3];  float i11 = imat[4];  float i12 = imat[5];
        float D = i00 * i11 - i01 * i10;
        D = D != 0 ? 1.0 / D : 0;
        float A11 = i11 * D;
        float A22 = i00 * D;
        float A12 = -i01 * D;
        float A21 = -i10 * D;
        float b1 = -A11 * i02 - A12 * i12;
        float b2 = -A21 * i02 - A22 * i12;
        omat[0] = A11;  omat[1] = A12;  omat[2] = b1;
        omat[3] = A21;  omat[4] = A22;  omat[5] = b2;
    }

    void compute(const cv::Size& from, const cv::Size& to){
        float scale_x = to.width / (float)from.width;
        float scale_y = to.height / (float)from.height;
        float scale = std::min(scale_x, scale_y); 
        i2d[0] = scale;  i2d[1] = 0;  i2d[2] = 
            -scale * from.width  * 0.5  + to.width * 0.5 + scale * 0.5 - 0.5;

        i2d[3] = 0;  i2d[4] = scale;  i2d[5] = 
            -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;

        invertAffineTransform(i2d, d2i);
    }
};
//在第一个平台中创建只包括GPU的上下文
cl_context CreateContext()
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_platform_id firstPlatformId;
    cl_context context = NULL;
 
    // 选择第一个平台
    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms <= 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }
 
    // 接下来尝试通过GPU设备建立上下文
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)firstPlatformId,
        0
    };
    context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
                                      NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cout << "Could not create GPU context, trying CPU..." << std::endl;
        context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
                                          NULL, NULL, &errNum);
        if (errNum != CL_SUCCESS)
        {
            std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
            return NULL;
        }
    }
 
    return context;
}
 
//在第一个设备上创建命令队列
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
    cl_int errNum;
    cl_device_id *devices;
    cl_command_queue commandQueue = NULL;
    size_t deviceBufferSize = -1;
 
    // 首先获得设备的信息
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
        return NULL;
    }
 
    if (deviceBufferSize <= 0)
    {
        std::cerr << "No devices available.";
        return NULL;
    }
 
    //为设备分配内存
    devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to get device IDs";
        return NULL;
    }
 
    // 选择第一个设备并为其创建命令队列
    cl_queue_properties properties[] = {0};
    commandQueue = clCreateCommandQueueWithProperties(context, devices[0], properties, NULL);
    if (commandQueue == NULL)
    {
        std::cerr << "Failed to create commandQueue for device 0";
        return NULL;
    }
    
    //释放信息
    *device = devices[0];
    delete [] devices;
    return commandQueue;
}
 
//  创建OpenCL程序对象
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
    cl_int errNum;
    cl_program program;
 
    std::ifstream kernelFile(fileName, std::ios::in);
    if (!kernelFile.is_open())
    {
        std::cerr << "Failed to open file for reading: " << fileName << std::endl;
        return NULL;
    }
 
    std::ostringstream oss;
    oss << kernelFile.rdbuf();
 
    std::string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();
    program = clCreateProgramWithSource(context, 1,
                                        (const char**)&srcStr,
                                        NULL, NULL);
    if (program == NULL)
    {
        std::cerr << "Failed to create CL program from source." << std::endl;
        return NULL;
    }
 
    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // 输出错误信息
        char buildLog[16384];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog), buildLog, NULL);
 
        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
        clReleaseProgram(program);
        return NULL;
    }
 
    return program;
}
 
 
//清除资源
void Cleanup(cl_context context, cl_command_queue commandQueue,
             cl_program program, cl_kernel kernel, cl_mem imageObjects[2],
             cl_sampler sampler)
{
    for (int i = 0; i < 2; i++)
    {
        if (imageObjects[i] != 0)
            clReleaseMemObject(imageObjects[i]);
    }
    if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
 
    if (kernel != 0)
        clReleaseKernel(kernel);
 
    if (program != 0)
        clReleaseProgram(program);
 
    if (sampler != 0)
        clReleaseSampler(sampler);
 
    if (context != 0)
        clReleaseContext(context);
 
}
const char* GetOpenCLErrorString(cl_int errorCode)
{
    switch (errorCode) {
        case CL_SUCCESS:
            return "CL_SUCCESS";
        case CL_DEVICE_NOT_FOUND:
            return "CL_DEVICE_NOT_FOUND";
        case CL_INVALID_VALUE:
            return "CL_INVALID_VALUE";
        // 其他错误码的处理
        default:
            return "Unknown error code";
    }
}

cl_mem LoadImage(cl_context context, char* fileName, int& width, int& height)
{
    cv::Mat image = cv::imread(fileName, cv::IMREAD_COLOR);
    
    if (image.empty())
    {
        std::cerr << "Error loading image" << std::endl;
        return 0;
    }
    
    /* 修改：将图像数据从 BGR 转换为 RGBA 格式 */
    //一般图像算法都为rgba的格式
    cv::cvtColor(image, image, cv::COLOR_BGR2RGBA);

    width = image.cols;
    height = image.rows;

    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNSIGNED_INT8;

    cl_int errNum;
    cl_mem clImage;

    cl_image_desc clImageDesc;
    memset(&clImageDesc, 0, sizeof(cl_image_desc));
    clImageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    clImageDesc.image_width = width;
    clImageDesc.image_height = height;
    clImageDesc.image_row_pitch = 0;
    clImageDesc.image_slice_pitch = 0;
    clImageDesc.num_mip_levels = 0;
    clImageDesc.num_samples = 0;
    clImageDesc.image_depth = 1;

    /* 移除：不再需要使用缓冲区 */

    clImage = clCreateImage(
        context,
        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        &clImageFormat,
        &clImageDesc,
        image.data,
        &errNum
    );
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL image object:" << GetOpenCLErrorString(errNum) << std::endl;
        return 0;
    }

    return clImage;
}

void saveRGBAtoJPG(const char* filename, const char* buffer, int width, int height) {
    cv::Mat image(height, width, CV_8UC4, (void*)buffer);
    // 转换 RGBA 到 BGR 格式
    cv::cvtColor(image, image, cv::COLOR_RGBA2BGR);
    // 保存图像为 JPG 文件
    cv::imwrite(filename, image);
}


//获取最接近的倍数
//任务均分：在并行计算中，经常需要将一个较大的任务或数据集分成多个小任务或数据块，分配给不同的处理单元并行执行。
//使用 RoundUp 函数可以将总任务数 globalSize 向上舍入到 groupSize 的倍数，确保每个处理单元都获得相等的任务数，避免了任务不均衡的情况。

//内存对齐：在一些场景下，为了提高内存访问的效率，需要将数据按照一定的对齐方式存储，即确保数据的起始地址和长度都是某个特定数值的倍数。
//通过使用 RoundUp 函数，可以将数据长度 globalSize 向上舍入到 groupSize 的倍数，以满足对齐的要求，从而获得更好的内存访问性能。
size_t RoundUp(int groupSize, int globalSize)
{
    int r = globalSize % groupSize;
    if(r == 0)
    {
         return globalSize;
    }
    else
    {
         return globalSize + groupSize - r;
    }
}
// 创建输出的图像对象
cl_mem CreateOutputImage(cl_context context, int width, int height)
{
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNSIGNED_INT8;

    cl_int errNum;
    cl_mem clImage;

    cl_image_desc clImageDesc;
    memset(&clImageDesc, 0, sizeof(cl_image_desc));
    clImageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    clImageDesc.image_width = width;
    clImageDesc.image_height = height;
    clImageDesc.image_row_pitch = 0;
    clImageDesc.image_slice_pitch = 0;
    clImageDesc.num_mip_levels = 0;
    clImageDesc.num_samples = 0;
    clImageDesc.image_depth = 1;

    clImage = clCreateImage(
        context,
        CL_MEM_WRITE_ONLY,
        &clImageFormat,
        &clImageDesc,
        NULL,
        &errNum
    );





    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating output CL image object: " << GetOpenCLErrorString(errNum) << std::endl;
        return 0;
    }

    return clImage;
}
 
int main()
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    //这段代码定义了一个长度为 2 的 cl_mem 数组 imageObjects，并初始化所有元素为 0。
    cl_mem imageObjects[2] = { 0, 0 };
    //图像采样器 (cl_sampler) 可以与图像对象 (cl_mem) 一起使用，
    //用于在内核函数中从图像中获取特定位置像素的值。它控制着采样的方式，
    //以及在读取图像时如何处理越界的、边界问题，以及如何进行插值以获得平滑的结果。
    cl_sampler sampler = 0;
    cl_int errNum;
 
 
 
 
    // 创建上下文
    context = CreateContext();
    if (context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }
 
    // 创建命令队列
    commandQueue = CreateCommandQueue(context, &device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 确保设备支持这种图像格式
    cl_bool imageSupport = CL_FALSE;
    clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
                    &imageSupport, NULL);
    if (imageSupport != CL_TRUE)
    {
        std::cerr << "OpenCL device does not support images." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 加载图像
    int width, height;
    char* imagePath = "/work/myopencl/build/test.jpg";
    imageObjects[0] = LoadImage(context, imagePath, width, height);
    if (imageObjects[0] == 0)
    {
        std::cerr << "Error loading: " << std::string("123.png") << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 创建输出的图像对象
    int outputWidth=640;
    int outputHeight=640;
    imageObjects[1] = CreateOutputImage(context,outputWidth,outputHeight);
    if(imageObjects[1] == 0)
    {
        std::cerr << "Error creating CL output image object." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }

    AffineMatrix affine;
    affine.compute(cv::Size(width, height), cv::Size(outputWidth, outputHeight));

    // 创建一个包含 float 数组数据的缓冲区对象
    cl_mem kbuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * 6, affine.d2i, NULL);

    // 创建采样器对象
    //寻址模式（Addressing Mode）是一种定义在像素采样过程中如何处理超出纹理边界的采样坐标的方法。它决定了当采样坐标超过纹理边界时，如何获取纹理中的值。
    //CL_FALSE：这是一个布尔值参数，表示使用非规范化的采样坐标。非规范化坐标意味着采样坐标在整数范围内，而不是标准化到 [0, 1] 的范围。
    //CL_ADDRESS_CLAMP_TO_EDGE：这个参数指定了图像采样器的地址模式，CL_ADDRESS_CLAMP_TO_EDGE 表示超出图像边界的采样坐标将被截断到最近的边缘像素的颜色值。
    //CL_FILTER_NEAREST：这个参数指定了图像采样器的过滤方式，CL_FILTER_NEAREST 表示使用最近邻插值，也就是返回与采样坐标最近的像素值，不进行插值计算。
    cl_sampler_properties samplerProps[] = {
        CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE, // 非规范化坐标
        CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_CLAMP_TO_EDGE, // 寻址模式为 CL_ADDRESS_CLAMP_TO_EDGE
        CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST, // 过滤模式为 CL_FILTER_NEAREST
        0 // 列表结束符
    };

    sampler = clCreateSamplerWithProperties(context, samplerProps, &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed to create sampler: " << GetOpenCLErrorString(errNum) << std::endl;
        return 0;
    }
 
    // 创建OpenCL程序对象
    program = CreateProgram(context, device, "/work/myopencl/resource/mywork.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 创建OpenCL核
    kernel = clCreateKernel(program, "affine_transformation", NULL);
    if (kernel == NULL)
    {
        std::cerr << "Failed to create kernel" << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 设定参数
    //该操作的目的是将 clSetKernelArg 函数的返回值（错误码）累积到 errNum 变量中
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    //取的只是传递参数的大小
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&kbuffer);
    //传递宽高：
    errNum |= clSetKernelArg(kernel, 4, sizeof(int), &width);
    errNum |= clSetKernelArg(kernel, 5, sizeof(int), &height);
    errNum |= clSetKernelArg(kernel, 6, sizeof(int), &outputWidth);
    errNum |= clSetKernelArg(kernel, 7, sizeof(int), &outputHeight);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    size_t globalWorkSize[2] = { 10000,10000 };
    // size_t localWorkSize[2] =  { RoundUp(localWorkSize[0], width),
    //                               RoundUp(localWorkSize[1], height) };
 
    // 将内核排队
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
                                    globalWorkSize, NULL,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    // 将输出缓冲区读回主机
    char *buffer = new char [outputWidth * outputHeight * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { size_t(outputWidth), size_t(outputHeight), 1};
    //在 OpenCL 中，clEnqueueReadImage 函数用于从图像对象中读取数据到主机内存。该函数的 origin 和 region 参数用于指定要读取的区域。
    //origin 参数是一个包含三个元素的数组，即 [x, y, z]，指定了要读取的起始位置在图像中的坐标。
    //origin[0] 表示 x 坐标，origin[1] 表示 y 坐标，origin[2] 表示 z 坐标。对于二维图像，我们通常将 origin[2] 设置为 0。
    //region 参数也是一个包含三个元素的数组，即 [width, height, depth]，指定了要读取的区域的尺寸。
    //region[0] 表示区域的宽度，region[1] 表示区域的高度，region[2] 表示区域的深度。对于二维图像，我们可以将 region[2] 设置为 1
    errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE,
                                origin, region, 0, 0, buffer,
                                0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error reading result buffer." << std::endl;
        Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
        return 1;
    }
 
    std::cout << std::endl;
    std::cout << "Executed program succesfully." << std::endl;
 
    //保存输出图像
    saveRGBAtoJPG("result.jpg", buffer, outputWidth, outputHeight);
    delete [] buffer;
    Cleanup(context, commandQueue, program, kernel, imageObjects, sampler);
    return 0;
}

设备端函数

__kernel void affine_transformation(__read_only image2d_t srcImg,
                                    __write_only image2d_t dstImg,
                                    sampler_t sampler,
                                    __global float* mykernel,
                                    int srcWidth,
                                    int srcHeight,
                                    int dstWidth,
                                    int dstHeight)

{

    int dstx = get_global_id(0);
    int dsty = get_global_id(1);
    if (dstx >= dstWidth || dsty >= dstHeight)  return;
    //默认填充灰色：
    float c0 = 114, c1 = 114, c2 = 114;
    float src_x = 0; float src_y = 0;

    src_x = mykernel[0] * dstx + mykernel[1] * dsty + mykernel[2];
    src_y = mykernel[3] * dstx + mykernel[4] * dsty + mykernel[5];
    //printf("%f,%f\n",src_x,src_y);
    if(src_x < -1 || src_x >= srcWidth || src_y < -1 || src_y >= srcHeight)
    {
    }
    else
    {
        int y_low = floor(src_y);
        int x_low = floor(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        int fill_value=114;
        float ly    = src_y - y_low;
        float lx    = src_x - x_low;
        float hy    = 1 - ly;
        float hx    = 1 - lx;
        float w1    = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        float4 v1=(float4)(fill_value, fill_value, fill_value,1.0);
        float4 v2=(float4)(fill_value, fill_value, fill_value,1.0);
        float4 v3=(float4)(fill_value, fill_value, fill_value,1.0);
        float4 v4=(float4)(fill_value, fill_value, fill_value,1.0);
        if(y_low >= 0){
            if (x_low >= 0)
                v1=convert_float4(read_imageui(srcImg, sampler, (int2)(x_low, y_low)));
            if (x_high < srcWidth)
                v2=convert_float4(read_imageui(srcImg, sampler, (int2)(x_high, y_low)));
        }
        
        if(y_high < srcHeight){
            if (x_low >= 0)
                v3=convert_float4(read_imageui(srcImg, sampler, (int2)(x_low, y_high)));
            if (x_high < srcWidth)
                v4=convert_float4(read_imageui(srcImg, sampler, (int2)(x_high, y_high)));
        }

        c0 = floor(w1 * v1.x + w2 * v2.x + w3 * v3.x + w4 * v4.x + 0.5f);
        c1 = floor(w1 * v1.y + w2 * v2.y + w3 * v3.y + w4 * v4.y + 0.5f);
        c2 = floor(w1 * v1.z + w2 * v2.z + w3 * v3.z + w4 * v4.z + 0.5f);
       // printf("%f\n",convert_float4(read_imageui(srcImg, sampler, (int2)(3,3)).x);
    }

    int2 dstindex= (int2) (dstx, dsty);
    float4 outColor = (float4)(c0, c1, c2, 1.0f);
    int4 clampedPixel = convert_int4(clamp(outColor, 0.0f, 255.0f));
    write_imagei(dstImg, dstindex, clampedPixel);
}