初始状态
rotate
版本I
本题的每一步都需要寻址,而且每一步的目标地址各不相同;同时每个地址只赋值一次,不会对同一个地址进行二次寻址。因此我首先想到的改进方向是让寻址更快,具体做法是让内层循环中对dst的写入地址尽量连续。
/* Rotate, version I: for each source column, write one run of consecutive
 * dst indices, so the inner loop's stores are sequential. */
int row, col, base, last;
last = dim - 1;
for (col = 0; col < dim; col++) {
    /* First dst index of the run that receives source column `col`. */
    base = (last - col) * dim;
    for (row = 0; row < dim; row++) {
        dst[base + row] = src[RIDX(row, col, dim)];
    }
}
版本II
要降低CPE,可以从减少cache miss入手。为此采用分块(blocking)的方法:每次只处理一个较小的数据块并反复利用其中的数据,而不是完整地遍历一整行或一整列,从而改进空间局部性。
/* Rotate, version II: walk the image in 8x8 tiles and copy each tile
 * element to its rotated position.
 * NOTE(review): the inner loops always run 8 steps, so this presumably
 * relies on dim being a multiple of 8 -- confirm against the caller. */
int tile_r, tile_c, r, c;
for (tile_r = 0; tile_r < dim; tile_r += 8) {
    for (tile_c = 0; tile_c < dim; tile_c += 8) {
        for (r = tile_r; r < tile_r + 8; r++) {
            for (c = tile_c; c < tile_c + 8; c++) {
                dst[RIDX(dim - 1 - c, r, dim)] = src[RIDX(r, c, dim)];
            }
        }
    }
}
Rotate的Summary由5.0提高至7.9:Dim规模较小时CPE优化不明显,Dim规模较大时CPE明显下降。
版本III
将前两种方法结合
int i, j, a, b, t;
int sdim = dim - 1;
for (i = 0; i < dim; i += 8)
{
for (j = 0; j < dim; j += 8)
{
for (b = j; b < j + 8; b++)
{
t = (sdim - b)*dim;
for (a = i; a < i + 8; a++)
{
dst[t+a] = src[RIDX(a, b, dim)];
}
}
}
}
版本IV
考虑到程序多次调用RIDX,故将其直接展开为下标计算,消除这些调用。此外,改善读写顺序:先处理矩阵第一列的前32个元素,再处理第二列的前32个元素,以此类推,直到处理完矩阵的前32行;然后用相同的方法依次处理余下的每32行。
/* Rotate, version IV: index arithmetic written out instead of RIDX.
 * The image is processed in bands of 32 source rows; within a band, each
 * source column supplies 32 values that land on 32 consecutive dst indices.
 * NOTE(review): the k loop always runs 32 steps, so this presumably relies
 * on dim being a multiple of 32 -- confirm against the caller. */
int band, col, k;
int d_first, s_first;
for (band = 0; band < dim; band += 32) {
    for (col = 0; col < dim; col++) {
        d_first = (dim - 1 - col) * dim + band; /* first dst index of this run */
        s_first = band * dim + col;             /* first src index of this run */
        for (k = 0; k < 32; k++) {
            /* src advances by one row (dim elements) per step, dst by one element. */
            dst[d_first + k] = src[s_first + k * dim];
        }
    }
}
最终版本
smooth
版本I
虽然不同位置的像素点参与平均的像素数目不同,但只有4、6、9三种情况(均包含该像素点自身)。对于四个顶点,取其所在2×2区域内四个像素点的平均值;对于位于边界上但不是顶点的像素点,取所在2×3或3×2区域内六个像素点的平均值;其余内部像素点取3×3区域内九个像素点的平均值。
/* Smooth, version I: every pixel class (corner, border, interior) is handled
 * by its own straight-line code, averaging 4, 6 or 9 source pixels per
 * channel with no per-pixel bounds checks. */
int i = 1, j = 0;
int up, cur, dn; /* flat offsets of rows i-1, i and i+1 */

/* Top-left corner */
dst[0].red = (src[0].red + src[1].red + src[dim].red + src[dim + 1].red) / 4;
dst[0].green = (src[0].green + src[1].green + src[dim].green + src[dim + 1].green) / 4;
dst[0].blue = (src[0].blue + src[1].blue + src[dim].blue + src[dim + 1].blue) / 4;

/* Top row, corners excluded */
for (j = 1; j < dim - 1; j++) {
    dst[j].red = (src[j - 1].red + src[j].red + src[j + 1].red
                  + src[dim + j - 1].red + src[dim + j].red + src[dim + j + 1].red) / 6;
    dst[j].green = (src[j - 1].green + src[j].green + src[j + 1].green
                    + src[dim + j - 1].green + src[dim + j].green + src[dim + j + 1].green) / 6;
    dst[j].blue = (src[j - 1].blue + src[j].blue + src[j + 1].blue
                   + src[dim + j - 1].blue + src[dim + j].blue + src[dim + j + 1].blue) / 6;
}

/* Top-right corner: j keeps the value it had when the loop above exited */
dst[j].red = (src[j].red + src[j - 1].red + src[dim + j].red + src[dim + j - 1].red) / 4;
dst[j].green = (src[j].green + src[j - 1].green + src[dim + j].green + src[dim + j - 1].green) / 4;
dst[j].blue = (src[j].blue + src[j - 1].blue + src[dim + j].blue + src[dim + j - 1].blue) / 4;

/* Rows 1 .. dim-2 */
for (; i < dim - 1; i++) {
    up = (i - 1) * dim;
    cur = i * dim;
    dn = (i + 1) * dim;

    /* First pixel of the row */
    dst[cur].red = (src[up].red + src[up + 1].red + src[cur].red
                    + src[cur + 1].red + src[dn].red + src[dn + 1].red) / 6;
    dst[cur].green = (src[up].green + src[up + 1].green + src[cur].green
                      + src[cur + 1].green + src[dn].green + src[dn + 1].green) / 6;
    dst[cur].blue = (src[up].blue + src[up + 1].blue + src[cur].blue
                     + src[cur + 1].blue + src[dn].blue + src[dn + 1].blue) / 6;

    /* Interior pixels of the row */
    for (j = 1; j < dim - 1; j++) {
        dst[cur + j].red = (src[up + j - 1].red + src[up + j].red + src[up + j + 1].red
                            + src[cur + j - 1].red + src[cur + j].red + src[cur + j + 1].red
                            + src[dn + j - 1].red + src[dn + j].red + src[dn + j + 1].red) / 9;
        dst[cur + j].green = (src[up + j - 1].green + src[up + j].green + src[up + j + 1].green
                              + src[cur + j - 1].green + src[cur + j].green + src[cur + j + 1].green
                              + src[dn + j - 1].green + src[dn + j].green + src[dn + j + 1].green) / 9;
        dst[cur + j].blue = (src[up + j - 1].blue + src[up + j].blue + src[up + j + 1].blue
                             + src[cur + j - 1].blue + src[cur + j].blue + src[cur + j + 1].blue
                             + src[dn + j - 1].blue + src[dn + j].blue + src[dn + j + 1].blue) / 9;
    }

    /* Last pixel of the row: j keeps the inner loop's exit value */
    dst[cur + j].red = (src[up + j - 1].red + src[up + j].red + src[cur + j - 1].red
                        + src[cur + j].red + src[dn + j - 1].red + src[dn + j].red) / 6;
    dst[cur + j].green = (src[up + j - 1].green + src[up + j].green + src[cur + j - 1].green
                          + src[cur + j].green + src[dn + j - 1].green + src[dn + j].green) / 6;
    dst[cur + j].blue = (src[up + j - 1].blue + src[up + j].blue + src[cur + j - 1].blue
                         + src[cur + j].blue + src[dn + j - 1].blue + src[dn + j].blue) / 6;
}

/* Bottom row: i keeps the value it had when the row loop exited */
up = (i - 1) * dim;
cur = i * dim;

/* Bottom-left corner */
dst[cur].red = (src[up].red + src[up + 1].red + src[cur].red + src[cur + 1].red) / 4;
dst[cur].green = (src[up].green + src[up + 1].green + src[cur].green + src[cur + 1].green) / 4;
dst[cur].blue = (src[up].blue + src[up + 1].blue + src[cur].blue + src[cur + 1].blue) / 4;

/* Bottom row, corners excluded */
for (j = 1; j < dim - 1; j++) {
    dst[cur + j].red = (src[up + j - 1].red + src[up + j].red + src[up + j + 1].red
                        + src[cur + j - 1].red + src[cur + j].red + src[cur + j + 1].red) / 6;
    dst[cur + j].green = (src[up + j - 1].green + src[up + j].green + src[up + j + 1].green
                          + src[cur + j - 1].green + src[cur + j].green + src[cur + j + 1].green) / 6;
    dst[cur + j].blue = (src[up + j - 1].blue + src[up + j].blue + src[up + j + 1].blue
                         + src[cur + j - 1].blue + src[cur + j].blue + src[cur + j + 1].blue) / 6;
}

/* Bottom-right corner */
dst[cur + j].red = (src[up + j - 1].red + src[up + j].red + src[cur + j - 1].red + src[cur + j].red) / 4;
dst[cur + j].green = (src[up + j - 1].green + src[up + j].green + src[cur + j - 1].green + src[cur + j].green) / 4;
dst[cur + j].blue = (src[up + j - 1].blue + src[up + j].blue + src[cur + j - 1].blue + src[cur + j].blue) / 4;
版本II
由于上个版本在运行过程中存在较多重复计算,这里借鉴动态规划的思想:将每个像素点的计算看作对一个块(2x2、2x3、3x2或3x3)内各像素点取平均值,并把每个块按列拆分为2或3个纵向块,用动规数组记录每个纵向块(同一列中连续的2个或3个像素点)的RGB之和。其中同一列上下相邻的两个3像素纵向块之间的递推关系通式为:dp[i][j]=dp[i-1][j]-src[(i-1)*dim+j]+src[(i+2)*dim+j]
// Smooth, version II: precompute per-column vertical sums once, then build
// every output pixel from two or three adjacent column sums.
//   r2/g2/b2[0][j] : sum of rows 0 and 1 in column j
//   r2/g2/b2[1][j] : sum of rows dim-2 and dim-1 in column j
//   r3/g3/b3[i][j] : sum of rows i, i+1, i+2 in column j (i = 0 .. dim-3)
// NOTE(review): the code reads row 2 and indexes r3[dim-3], so it presumably
// requires dim >= 3 -- confirm against the caller.
int i,j;
int r2[2][dim],g2[2][dim],b2[2][dim];
// NOTE(review): declared dim x dim, but only rows 0 .. dim-3 are written below.
// If these are function-local arrays sized by a runtime dim, storage grows
// with dim*dim -- confirm this is acceptable for the largest dim used.
int r3[dim][dim],g3[dim][dim],b3[dim][dim];
for(j=0; j<dim; j++) {
// first 2-row sum of column j (rows 0 and 1)
r2[0][j]=src[j].red;
g2[0][j]=src[j].green;
b2[0][j]=src[j].blue;
r2[0][j]+=src[dim+j].red;
g2[0][j]+=src[dim+j].green;
b2[0][j]+=src[dim+j].blue;
// first 3-row sum of column j (rows 0, 1 and 2)
r3[0][j]=r2[0][j]+src[(dim<<1)+j].red;
g3[0][j]=g2[0][j]+src[(dim<<1)+j].green;
b3[0][j]=b2[0][j]+src[(dim<<1)+j].blue;
// remaining 3-row sums: slide the window down one row by dropping row i-1
// and adding row i+2
for(i=1; i<dim-2; i++) {
r3[i][j]=r3[i-1][j]-src[(i-1)*dim+j].red+src[(i+2)*dim+j].red;
g3[i][j]=g3[i-1][j]-src[(i-1)*dim+j].green+src[(i+2)*dim+j].green;
b3[i][j]=b3[i-1][j]-src[(i-1)*dim+j].blue+src[(i+2)*dim+j].blue;
}
// last 2-row sum of column j: the last 3-row sum minus row dim-3
r2[1][j]=r3[dim-3][j]-src[(dim-3)*dim+j].red;
g2[1][j]=g3[dim-3][j]-src[(dim-3)*dim+j].green;
b2[1][j]=b3[dim-3][j]-src[(dim-3)*dim+j].blue;
}
// top-left corner: two 2-row column sums, 4 pixels
dst[0].red=(r2[0][0]+r2[0][1])/4;
dst[0].green=(g2[0][0]+g2[0][1])/4;
dst[0].blue=(b2[0][0]+b2[0][1])/4;
// top row, corners excluded: three 2-row column sums, 6 pixels
for(j=1; j<dim-1; j++) {
dst[j].red=(r2[0][j-1]+r2[0][j]+r2[0][j+1])/6;
dst[j].green=(g2[0][j-1]+g2[0][j]+g2[0][j+1])/6;
dst[j].blue=(b2[0][j-1]+b2[0][j]+b2[0][j+1])/6;
}
// top-right corner (j keeps the value it had when the loop above exited)
dst[j].red=(r2[0][j-1]+r2[0][j])/4;
dst[j].green=(g2[0][j-1]+g2[0][j])/4;
dst[j].blue=(b2[0][j-1]+b2[0][j])/4;
// rows 1 .. dim-2; row i uses the 3-row sums stored at index i-1 (rows i-1 .. i+1)
for(i=1; i<dim-1; i++) {
// first pixel of the row: two 3-row column sums, 6 pixels
dst[i*dim].red=(r3[i-1][0]+r3[i-1][1])/6;
dst[i*dim].green=(g3[i-1][0]+g3[i-1][1])/6;
dst[i*dim].blue=(b3[i-1][0]+b3[i-1][1])/6;
// interior pixels of the row: three 3-row column sums, 9 pixels
for(j=1; j<dim-1; j++) {
dst[i*dim+j].red=(r3[i-1][j-1]+r3[i-1][j]+r3[i-1][j+1])/9;
dst[i*dim+j].green=(g3[i-1][j-1]+g3[i-1][j]+g3[i-1][j+1])/9;
dst[i*dim+j].blue=(b3[i-1][j-1]+b3[i-1][j]+b3[i-1][j+1])/9;
}
// last pixel of the row (j keeps the inner loop's exit value)
dst[i*dim+j].red=(r3[i-1][j-1]+r3[i-1][j])/6;
dst[i*dim+j].green=(g3[i-1][j-1]+g3[i-1][j])/6;
dst[i*dim+j].blue=(b3[i-1][j-1]+b3[i-1][j])/6;
}
// bottom-left corner (i keeps the value it had when the row loop exited)
dst[i*dim].red=(r2[1][0]+r2[1][1])/4;
dst[i*dim].green=(g2[1][0]+g2[1][1])/4;
dst[i*dim].blue=(b2[1][0]+b2[1][1])/4;
// bottom row, corners excluded: three 2-row column sums, 6 pixels
for(j=1; j<dim-1; j++) {
dst[i*dim+j].red=(r2[1][j-1]+r2[1][j]+r2[1][j+1])/6;
dst[i*dim+j].green=(g2[1][j-1]+g2[1][j]+g2[1][j+1])/6;
dst[i*dim+j].blue=(b2[1][j-1]+b2[1][j]+b2[1][j+1])/6;
}
// bottom-right corner
dst[i*dim+j].red=(r2[1][j-1]+r2[1][j])/4;
dst[i*dim+j].green=(g2[1][j-1]+g2[1][j])/4;
dst[i*dim+j].blue=(b2[1][j-1]+b2[1][j])/4;
版本III