Windows和Linux系统上的矢量运算:指令级并行计算SIMD(SSE/VAX)

news2026/2/11 17:25:44

注：本文的SIMD，指的是CPU指令架构中的相关概念。不涉及GPU端的算力机制。

基本概念

SIMD，Single Instruction/Multiple Data，即单指流令多数据流，例如一个乘法指令，可以并行的计算8个浮点数的乘法。

SIMD(Single Instruction/Multiple Data，即)是目前通用的CPU端的指令级并行计算机制，也叫做矢量运算，这些SIMD包括SSE和AVX。

通过代码直观的简介SIMD机制

用下面两份代码做个基本说明

非矢量运算的代码

void mul4_scalar( float* ptr )
{
    for( int i = 0; i < 4; i++ )
    {
        const float f = ptr[ i ];
        ptr[ i ] = f * f;
    }
}

矢量运算的代码

void mul4_vectorized( float* ptr )
{
    __m128 f = _mm_loadu_ps( ptr );
    f = _mm_mul_ps( f, f );
    _mm_storeu_ps( ptr, f );
}
// __m128 就是sse simd 对象
// _mm_mul_ps 和 _mm_storeu_ps 就是对应的乘法和赋值矢量运算指令(函数)

上述两份代码源自于: http://const.me/articles/simd/simd.pdf

不难看出，在密集计算中SIMD程序的效能肯定比常规程序高很多。(这里就不去和异构计算架构下的机制做比较了)

相关指令在源代码中的位置

Linux系统GCC

gcc-master\gcc\config\i386\immintrin.h头文件中包含了各种simd指令的相关头文件

SSE, __m128, 这类SSE指令位置: gcc-master\gcc\config\i386\xmmintrin.h

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

SSE2, __v4si, __128d定义在 gcc-master\gcc\config\i386\emmintrin.h中

typedef int __v4si __attribute__ ((__vector_size__ (16)));

typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

MMX指令 __m64定义在 gcc-master\gcc\config\i386\mmintrin.h中

typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));

AVX指令, _v4df, __v8si， __m256定义在gcc-master\gcc\config\i386\avxintrin.h中

typedef double __v4df __attribute__ ((__vector_size__ (32)));

typedef int __v8si __attribute__ ((__vector_size__ (32)));

typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32),
             __may_alias__));

linux系统中矢量运算 __attribute__ vector_size 定义的说明：

typedef int v4si __attribute__ ((vector_size (16)));

The int type specifies the base type, while the attribute specifies the vector size for the variable, measured in bytes. For example, the declaration above causes the compiler to set the mode for the v4si type to be 16 bytes wide and divided into int sized units. For a 32-bit int this means a vector of 4 units of 4 bytes, and the corresponding mode of foo will be V4SI.

相关细节请见: Vector Extensions (Using the GNU Compiler Collection (GCC))

Windows系统MSVC

头文件 Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.33.31629\include\intrin.h，也已经包含了相关指令函数的头文件

__m128d, __m128i定义在 Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.33.31629\include\emmintrin.h中

typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {
    double              m128d_f64[2];
} __m128d;

__m128d, __m128i定义在 Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.33.31629\include\emmintrin.h

typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {
    double              m128d_f64[2];
} __m128d;

AVX, __m256d, __m256, __m256i定义在 Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.33.31629\include\immintrin.h中

/*
 * Intel(R) AVX compiler intrinsic functions.
 */
typedef union __declspec(intrin_type) __declspec(align(32)) __m256 {
    float m256_f32[8];
} __m256;

typedef struct __declspec(intrin_type) __declspec(align(32)) __m256d {
    double m256d_f64[4];
} __m256d;

typedef union  __declspec(intrin_type) __declspec(align(32)) __m256i {
    __int8              m256i_i8[32];
    __int16             m256i_i16[16];
    __int32             m256i_i32[8];
    __int64             m256i_i64[4];
    unsigned __int8     m256i_u8[32];
    unsigned __int16    m256i_u16[16];
    unsigned __int32    m256i_u32[8];
    unsigned __int64    m256i_u64[4];
} __m256i;

两个系统的定义有区别，所以跨平台应用这些SIMD功能需要主要一些细节。

检查Linux系统或者Windows系统对SSE和AVX的支持情况

注：下面的代码本人常用在在Linux Centos7和Windows 10上

检测代码如下：

#include <iostream>
#ifdef _MSC_VER
# include <intrin.h>
void __cpuidSpec(int p0[4], int p1)
{
    __cpuid(p0, p1);
}
unsigned __int64 __cdecl _xgetbvSpec(unsigned int p)
{
    return _xgetbv(p);
}
#endif

#ifdef __GNUC__
void __cpuidSpec(int* cpuinfo, int info)
{
    __asm__ __volatile__(
        "xchg %%ebx, %%edi;"
        "cpuid;"
        "xchg %%ebx, %%edi;"
        : "=a"(cpuinfo[0]), "=D"(cpuinfo[1]), "=c"(cpuinfo[2]), "=d"(cpuinfo[3])
        : "0"(info));
}

unsigned long long _xgetbvSpec(unsigned int index)
{
    unsigned int eax, edx;
    __asm__ __volatile__(
        "xgetbv;"
        : "=a"(eax), "=d"(edx)
        : "c"(index));
    return ((unsigned long long)edx << 32) | eax;
}
#include <immintrin.h>
#endif

namespace sseavx
{
void sseavxCheck()
{
    bool sseSupportted    = false;
    bool sse2Supportted   = false;
    bool sse3Supportted   = false;
    bool ssse3Supportted  = false;
    bool sse4_1Supportted = false;
    bool sse4_2Supportted = false;
    bool sse4aSupportted  = false;
    bool sse5Supportted   = false;
    bool avxSupportted    = false;

    int cpuinfo[4];
    __cpuidSpec(cpuinfo, 1);

    // Check SSE, SSE2, SSE3, SSSE3, SSE4.1, and SSE4.2 support
    sseSupportted    = cpuinfo[3] & (1 << 25) || false;
    sse2Supportted   = cpuinfo[3] & (1 << 26) || false;
    sse3Supportted   = cpuinfo[2] & (1 << 0) || false;
    ssse3Supportted  = cpuinfo[2] & (1 << 9) || false;
    sse4_1Supportted = cpuinfo[2] & (1 << 19) || false;
    sse4_2Supportted = cpuinfo[2] & (1 << 20) || false;

    avxSupportted         = cpuinfo[2] & (1 << 28) || false;
    bool osxsaveSupported = cpuinfo[2] & (1 << 27) || false;
    if (osxsaveSupported && avxSupportted)
    {
        // _XCR_XFEATURE_ENABLED_MASK = 0
        unsigned long long xcrFeatureMask = _xgetbvSpec(0);
        avxSupportted                     = (xcrFeatureMask & 0x6) == 0x6;
    }

    // ----------------------------------------------------------------------

    // Check SSE4a and SSE5 support

    // Get the number of valid extended IDs
    __cpuidSpec(cpuinfo, 0x80000000);
    int numExtendedIds = cpuinfo[0];
    if (numExtendedIds >= 0x80000001)
    {
        __cpuidSpec(cpuinfo, 0x80000001);
        sse4aSupportted = cpuinfo[2] & (1 << 6) || false;
        sse5Supportted  = cpuinfo[2] & (1 << 11) || false;
    }

    // ----------------------------------------------------------------------
    std::boolalpha(std::cout);
    std::cout << "   Support SSE:   " << sseSupportted << std::endl;
    std::cout << "  Support SSE2:   " << sse2Supportted << std::endl;
    std::cout << "  Support SSE3:   " << sse3Supportted << std::endl;
    std::cout << "Support SSE4.1:   " << sse4_1Supportted << std::endl;
    std::cout << "Support SSE4.2:   " << sse4_2Supportted << std::endl;
    std::cout << " Support SSE4a:   " << sse4aSupportted << std::endl;
    std::cout << "  Support SSE5:   " << sse5Supportted << std::endl;
    std::cout << "   Support AVX:   " << avxSupportted << std::endl;
}
}

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/856784.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！