基于opencv 3.1.0源代码
sources\modules\imgproc\src\imgwarp.cpp
void cv::warpAffine( InputArray _src, OutputArray _dst,
InputArray _M0, Size dsize,
int flags, int borderType, const Scalar& borderValue )
{
...
if( !(flags & WARP_INVERSE_MAP) )
{
//变换矩阵求逆
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
...
//优先采用IPP加速
//省略
//...
//没有IPP库或处理失败时,才会继续运行下面的
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
Range range(0, dst.rows);
//仿射变换类WarpAffineInvoker
WarpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
//类WarpAffineInvoker实现
class WarpAffineInvoker :
public ParallelLoopBody
{
public:
WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
}
//核心代码就在这里,直接上CPU指令集来加速的,经过优化的代码会比较难读
virtual void operator() (const Range& range) const
{
...
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
//根据目标坐标(变换后)和逆矩阵求出源坐标
...
Mat _XY(bh, bw, CV_16SC2, XY), matA; //源坐标矩阵
Mat dpart(dst, Rect(x, y, bw, bh)); //目标矩阵
...
//调用remap函数
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
}
...
}
...
//remap函数实现
void cv::remap( InputArray _src, OutputArray _dst,
InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue )
{
//不同的插值算子
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
...
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
//优先采用IPP加速
//省略
//...
//没有IPP库或处理失败时,才会继续运行下面的
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
}
else
{
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC )
ifunc = cubic_tab[depth];
else if( interpolation == INTER_LANCZOS4 )
ifunc = lanczos4_tab[depth];
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
}
else
{
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
//插值处理在RemapInvoker类实现
RemapInvoker invoker(src, dst, m1, m2,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
}
//类RemapInvoker实现
class RemapInvoker :
public ParallelLoopBody
{
public:
RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
const Mat *_m2, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
}
//直接上CPU指令集来加速的,经过优化的代码会比较难读
virtual void operator() (const Range& range) const
{
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
int bcols0 = std::min(buf_size/brows0, dst->cols);
brows0 = std::min(buf_size/bcols0, dst->rows);
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst->cols; x += bcols0 )
{
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst->cols - x);
Mat dpart(*dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
//最近邻插值
if( nnfunc )
{
if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
bufxy = (*m1)(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
const short* sXY = m1->ptr<short>(y+y1) + x*2;
const ushort* sA = m2->ptr<ushort>(y+y1) + x;
for( x1 = 0; x1 < bcols; x1++ )
{
int a = sA[x1] & (INTER_TAB_SIZE2-1);
XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
}
}
}
else if( !planar_input )
(*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
const float* sX = m1->ptr<float>(y+y1) + x;
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(fx0);
__m128i ix1 = _mm_cvtps_epi32(fx1);
__m128i iy0 = _mm_cvtps_epi32(fy0);
__m128i iy1 = _mm_cvtps_epi32(fy1);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
XY[x1*2] = saturate_cast<short>(sX[x1]);
XY[x1*2+1] = saturate_cast<short>(sY[x1]);
}
}
}
nnfunc( *src, dpart, bufxy, borderType, borderValue );
continue;
}
//其它插值算法
Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
ushort* A = bufa.ptr<ushort>(y1);
if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
{
bufxy = (*m1)(Rect(x, y, bcols, brows));
const ushort* sA = m2->ptr<ushort>(y+y1) + x;
x1 = 0;
#if CV_NEON
...
#elif CV_SSE2
__m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
for ( ; x1 <= bcols - 8; x1 += 8)
_mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
#endif
for( ; x1 < bcols; x1++ )
A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
}
else if( planar_input )
{
const float* sX = m1->ptr<float>(y+y1) + x;
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SSE2
if( useSIMD )
{
__m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
__m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
for( ; x1 <= bcols - 8; x1 += 8 )
{
__m128 fx0 = _mm_loadu_ps(sX + x1);
__m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
__m128 fy0 = _mm_loadu_ps(sY + x1);
__m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
__m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
__m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
__m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
__m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
__m128i mx0 = _mm_and_si128(ix0, mask);
__m128i mx1 = _mm_and_si128(ix1, mask);
__m128i my0 = _mm_and_si128(iy0, mask);
__m128i my1 = _mm_and_si128(iy1, mask);
mx0 = _mm_packs_epi32(mx0, mx1);
my0 = _mm_packs_epi32(my0, my1);
my0 = _mm_slli_epi16(my0, INTER_BITS);
mx0 = _mm_or_si128(mx0, my0);
_mm_storeu_si128((__m128i*)(A + x1), mx0);
ix0 = _mm_srai_epi32(ix0, INTER_BITS);
ix1 = _mm_srai_epi32(ix1, INTER_BITS);
iy0 = _mm_srai_epi32(iy0, INTER_BITS);
iy1 = _mm_srai_epi32(iy1, INTER_BITS);
ix0 = _mm_packs_epi32(ix0, ix1);
iy0 = _mm_packs_epi32(iy0, iy1);
ix1 = _mm_unpacklo_epi16(ix0, iy0);
iy1 = _mm_unpackhi_epi16(ix0, iy0);
_mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
_mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
}
}
#elif CV_NEON
...
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
else
{
const float* sXY = m1->ptr<float>(y+y1) + x*2;
x1 = 0;
#if CV_NEON
...
#endif
for( x1 = 0; x1 < bcols; x1++ )
{
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
}
ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
...
};
//以下只截取最近邻插值算子的代码
template<typename T>
static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
int borderType, const Scalar& _borderValue )
{
Size ssize = _src.size(), dsize = _dst.size();
int cn = _src.channels();
const T* S0 = _src.ptr<T>();
size_t sstep = _src.step/sizeof(S0[0]);
Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
saturate_cast<T>(_borderValue[1]),
saturate_cast<T>(_borderValue[2]),
saturate_cast<T>(_borderValue[3]));
int dx, dy;
unsigned width1 = ssize.width, height1 = ssize.height;
if( _dst.isContinuous() && _xy.isContinuous() )
{
dsize.width *= dsize.height;
dsize.height = 1;
}
for( dy = 0; dy < dsize.height; dy++ )
{
T* D = _dst.ptr<T>(dy);
const short* XY = _xy.ptr<short>(dy);
if( cn == 1 ) //单通道
{
for( dx = 0; dx < dsize.width; dx++ )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
if( (unsigned)sx < width1 && (unsigned)sy < height1 )
D[dx] = S0[sy*sstep + sx];
else //越界
{
if( borderType == BORDER_REPLICATE )
{
sx = clip(sx, 0, ssize.width);
sy = clip(sy, 0, ssize.height);
D[dx] = S0[sy*sstep + sx];
}
else if( borderType == BORDER_CONSTANT )
D[dx] = cval[0];
else if( borderType != BORDER_TRANSPARENT )
{
//边界处理
sx = borderInterpolate(sx, ssize.width, borderType);
sy = borderInterpolate(sy, ssize.height, borderType);
D[dx] = S0[sy*sstep + sx];
}
}
}
}
else //多通道
{
for( dx = 0; dx < dsize.width; dx++, D += cn )
{
int sx = XY[dx*2], sy = XY[dx*2+1], k;
const T *S;
if( (unsigned)sx < width1 && (unsigned)sy < height1 )
{
if( cn == 3 )
{
S = S0 + sy*sstep + sx*3;
D[0] = S[0], D[1] = S[1], D[2] = S[2];
}
else if( cn == 4 )
{
S = S0 + sy*sstep + sx*4;
D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
}
else
{
S = S0 + sy*sstep + sx*cn;
for( k = 0; k < cn; k++ )
D[k] = S[k];
}
}
else if( borderType != BORDER_TRANSPARENT )
{
if( borderType == BORDER_REPLICATE )
{
sx = clip(sx, 0, ssize.width);
sy = clip(sy, 0, ssize.height);
S = S0 + sy*sstep + sx*cn;
}
else if( borderType == BORDER_CONSTANT )
S = &cval[0];
else
{
sx = borderInterpolate(sx, ssize.width, borderType);
sy = borderInterpolate(sy, ssize.height, borderType);
S = S0 + sy*sstep + sx*cn;
}
for( k = 0; k < cn; k++ )
D[k] = S[k];
}
}
}
}
}
边界处理的代码在sources\modules\core\src\copy.cpp
/*
Various border types, image boundaries are denoted with '|'
* BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
* BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
* BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
* BORDER_WRAP: cdefgh|abcdefgh|abcdefg
* BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii with some specified 'i'
*/
int cv::borderInterpolate( int p, int len, int borderType )
{
if( (unsigned)p < (unsigned)len )
;
else if( borderType == BORDER_REPLICATE )
p = p < 0 ? 0 : len - 1;
else if( borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101 )
{
int delta = borderType == BORDER_REFLECT_101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = len - 1 - (p - len) - delta;
}
while( (unsigned)p >= (unsigned)len );
}
else if( borderType == BORDER_WRAP )
{
CV_Assert(len > 0);
if( p < 0 )
p -= ((p-len+1)/len)*len;
if( p >= len )
p %= len;
}
else if( borderType == BORDER_CONSTANT )
p = -1;
else
CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
return p;
}