<C++> SSE指令集

news2024/10/6 8:37:01

SSE指令集

include库

#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h>    //所有版本(include immintrin.h;MSVC头文件,GCC/Clang下对应为<x86intrin.h>)

基本操作

  1. 使用SSE专门的LOAD指令将一个向量的数据从内存加载到寄存器;
  2. 使用SSE专门的OP指令对两个向量进行某种计算;
  3. 使用SSE专门的STORE指令把计算结果从寄存器写回到内存;

数据类型

  • __m128表示128bit的向量,包含4个单精度浮点数
typedef union __declspec(intrin_type) __declspec(align(16)) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;
  • __m128i表示128bit的整数型
typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;
  • __m128d表示128bit的向量,包含2个双精度浮点数
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {
    double              m128d_f64[2];
} __m128d;

指令函数命名

SSE指令的函数从命名上,主要分成三部分,以_mm_loadu_pd为例:

  1. 第一部分均以_mm开头,表示属于SSE指令集,_mm256或_mm512是AVX或AVX-512指令集的Intrinsic函数前缀;
  2. 第二部分表明操作类型,比如load,add,store等。但部分指令后面跟有[l|h|u|r]等字母,比如u表示mem_addr不需要内存对齐,r表示反向读取等;
  3. 第三部分为操作的对象名及数据类型:
    _ps:packed操作所有的单精度浮点数;
    _pd:packed操作所有的双精度浮点数;
    _pixx:(xx为长度,可以是8,16,32,64)packed操作所有的xx位有符号整数,使用的寄存器长度为64位;
    _epixx:(xx为长度)packed操作所有的xx位的有符号整数,使用的寄存器长度为128位;
    _epuxx: packed操作所有的xx位的无符号整数;
    _ss:scalar操作第一个单精度浮点数;
    _sd:scalar操作第一个双精度浮点数;
    p表示packed即对128bits的数据全部执行相同的操作,s表示scalar,只对128bits中的第一组数据执行操作,如下图所示。
    (原文此处为示意图,图片缺失:packed运算对寄存器中的每一组数据都执行操作;scalar运算只对最低位的一组数据执行操作,其余各组数据直接取自第一个操作数。)

1、load加载

__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
__m128d _mm_load_pd (double const* mem_addr)
__m128d _mm_load_pd1 (double const* mem_addr)
__m128 _mm_load_ps (float const* mem_addr)
__m128 _mm_load_ps1 (float const* mem_addr)
__m128d _mm_load_sd (double const* mem_addr)
__m128i _mm_load_si128 (__m128i const* mem_addr)
__m128 _mm_load_ss (float const* mem_addr)
__m128d _mm_load1_pd (double const* mem_addr)
__m128 _mm_load1_ps (float const* mem_addr)
__m128d _mm_loaddup_pd (double const* mem_addr)
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
__m128d _mm_loadr_pd (double const* mem_addr)
__m128 _mm_loadr_ps (float const* mem_addr)
__m128d _mm_loadu_pd (double const* mem_addr)
__m128 _mm_loadu_ps (float const* mem_addr)
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
__m128i _mm_loadu_si16 (void const* mem_addr)
__m128i _mm_loadu_si32 (void const* mem_addr)
__m128i _mm_loadu_si64 (void const* mem_addr)

2、OP操作

Arithmetic算术

__m128i _mm_add_epi16 (__m128i a, __m128i b)
__m128i _mm_add_epi32 (__m128i a, __m128i b)
__m128i _mm_add_epi64 (__m128i a, __m128i b)
__m128i _mm_add_epi8 (__m128i a, __m128i b)
__m128d _mm_add_pd (__m128d a, __m128d b)
__m128 _mm_add_ps (__m128 a, __m128 b)
__m128d _mm_add_sd (__m128d a, __m128d b)
__m64 _mm_add_si64 (__m64 a, __m64 b)
__m128 _mm_add_ss (__m128 a, __m128 b)
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
__m128d _mm_addsub_pd (__m128d a, __m128d b)
__m128 _mm_addsub_ps (__m128 a, __m128 b)
__m128d _mm_div_pd (__m128d a, __m128d b)
__m128 _mm_div_ps (__m128 a, __m128 b)
__m128d _mm_div_sd (__m128d a, __m128d b)
__m128 _mm_div_ss (__m128 a, __m128 b)
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
__m128d _mm_hadd_pd (__m128d a, __m128d b)
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
__m128 _mm_hadd_ps (__m128 a, __m128 b)
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
__m128d _mm_hsub_pd (__m128d a, __m128d b)
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
__m128 _mm_hsub_ps (__m128 a, __m128 b)
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
__m128d _mm_mul_pd (__m128d a, __m128d b)
__m128 _mm_mul_ps (__m128 a, __m128 b)
__m128d _mm_mul_sd (__m128d a, __m128d b)
__m128 _mm_mul_ss (__m128 a, __m128 b)
__m64 _mm_mul_su32 (__m64 a, __m64 b)
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
__m64 _m_pmulhuw (__m64 a, __m64 b)
__m64 _m_psadbw (__m64 a, __m64 b)
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
__m128d _mm_sub_pd (__m128d a, __m128d b)
__m128 _mm_sub_ps (__m128 a, __m128 b)
__m128d _mm_sub_sd (__m128d a, __m128d b)
__m64 _mm_sub_si64 (__m64 a, __m64 b)
__m128 _mm_sub_ss (__m128 a, __m128 b)
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
__m128i _mm_subs_epu8 (__m128i a, __m128i b)

Compare比较

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
__m128d _mm_cmple_pd (__m128d a, __m128d b)
__m128 _mm_cmple_ps (__m128 a, __m128 b)
__m128d _mm_cmple_sd (__m128d a, __m128d b)
__m128 _mm_cmple_ss (__m128 a, __m128 b)
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
int _mm_comieq_sd (__m128d a, __m128d b)
int _mm_comieq_ss (__m128 a, __m128 b)
int _mm_comige_sd (__m128d a, __m128d b)
int _mm_comige_ss (__m128 a, __m128 b)
int _mm_comigt_sd (__m128d a, __m128d b)
int _mm_comigt_ss (__m128 a, __m128 b)
int _mm_comile_sd (__m128d a, __m128d b)
int _mm_comile_ss (__m128 a, __m128 b)
int _mm_comilt_sd (__m128d a, __m128d b)
int _mm_comilt_ss (__m128 a, __m128 b)
int _mm_comineq_sd (__m128d a, __m128d b)
int _mm_comineq_ss (__m128 a, __m128 b)
int _mm_ucomieq_sd (__m128d a, __m128d b)
int _mm_ucomieq_ss (__m128 a, __m128 b)
int _mm_ucomige_sd (__m128d a, __m128d b)
int _mm_ucomige_ss (__m128 a, __m128 b)
int _mm_ucomigt_sd (__m128d a, __m128d b)
int _mm_ucomigt_ss (__m128 a, __m128 b)
int _mm_ucomile_sd (__m128d a, __m128d b)
int _mm_ucomile_ss (__m128 a, __m128 b)
int _mm_ucomilt_sd (__m128d a, __m128d b)
int _mm_ucomilt_ss (__m128 a, __m128 b)
int _mm_ucomineq_sd (__m128d a, __m128d b)
int _mm_ucomineq_ss (__m128 a, __m128 b)

Convert转换

__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
__m64 _mm_cvt_ps2pi (__m128 a)
__m128 _mm_cvt_si2ss (__m128 a, int b)
int _mm_cvt_ss2si (__m128 a)
__m128i _mm_cvtepi16_epi32 (__m128i a)
__m128i _mm_cvtepi16_epi64 (__m128i a)
__m128i _mm_cvtepi32_epi64 (__m128i a)
__m128d _mm_cvtepi32_pd (__m128i a)
__m128 _mm_cvtepi32_ps (__m128i a)
__m128i _mm_cvtepi8_epi16 (__m128i a)
__m128i _mm_cvtepi8_epi32 (__m128i a)
__m128i _mm_cvtepi8_epi64 (__m128i a)
__m128i _mm_cvtepu16_epi32 (__m128i a)
__m128i _mm_cvtepu16_epi64 (__m128i a)
__m128i _mm_cvtepu32_epi64 (__m128i a)
__m128i _mm_cvtepu8_epi16 (__m128i a)
__m128i _mm_cvtepu8_epi32 (__m128i a)
__m128i _mm_cvtepu8_epi64 (__m128i a)
__m128i _mm_cvtpd_epi32 (__m128d a)
__m64 _mm_cvtpd_pi32 (__m128d a)
__m128 _mm_cvtpd_ps (__m128d a)
__m128 _mm_cvtpi16_ps (__m64 a)
__m128d _mm_cvtpi32_pd (__m64 a)
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
__m128 _mm_cvtpi8_ps (__m64 a)
__m128i _mm_cvtps_epi32 (__m128 a)
__m128d _mm_cvtps_pd (__m128 a)
__m64 _mm_cvtps_pi16 (__m128 a)
__m64 _mm_cvtps_pi32 (__m128 a)
__m64 _mm_cvtps_pi8 (__m128 a)
__m128 _mm_cvtpu16_ps (__m64 a)
__m128 _mm_cvtpu8_ps (__m64 a)
double _mm_cvtsd_f64 (__m128d a)
int _mm_cvtsd_si32 (__m128d a)
__int64 _mm_cvtsd_si64 (__m128d a)
__int64 _mm_cvtsd_si64x (__m128d a)
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
int _mm_cvtsi128_si32 (__m128i a)
__int64 _mm_cvtsi128_si64 (__m128i a)
__int64 _mm_cvtsi128_si64x (__m128i a)
__m128d _mm_cvtsi32_sd (__m128d a, int b)
__m128i _mm_cvtsi32_si128 (int a)
__m128 _mm_cvtsi32_ss (__m128 a, int b)
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64_si128 (__int64 a)
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64x_si128 (__int64 a)
float _mm_cvtss_f32 (__m128 a)
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
int _mm_cvtss_si32 (__m128 a)
__int64 _mm_cvtss_si64 (__m128 a)
__m64 _mm_cvtt_ps2pi (__m128 a)
int _mm_cvtt_ss2si (__m128 a)
__m128i _mm_cvttpd_epi32 (__m128d a)
__m64 _mm_cvttpd_pi32 (__m128d a)
__m128i _mm_cvttps_epi32 (__m128 a)
__m64 _mm_cvttps_pi32 (__m128 a)
int _mm_cvttsd_si32 (__m128d a)
__int64 _mm_cvttsd_si64 (__m128d a)
__int64 _mm_cvttsd_si64x (__m128d a)
int _mm_cvttss_si32 (__m128 a)
__int64 _mm_cvttss_si64 (__m128 a)
__m128i _mm_packus_epi32 (__m128i a, __m128i b)

Logical逻辑

__m128d _mm_and_pd (__m128d a, __m128d b)
__m128 _mm_and_ps (__m128 a, __m128 b)
__m128i _mm_and_si128 (__m128i a, __m128i b)
__m128d _mm_andnot_pd (__m128d a, __m128d b)
__m128 _mm_andnot_ps (__m128 a, __m128 b)
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
__m128d _mm_or_pd (__m128d a, __m128d b)
__m128 _mm_or_ps (__m128 a, __m128 b)
__m128i _mm_or_si128 (__m128i a, __m128i b)
int _mm_test_all_ones (__m128i a)
int _mm_test_all_zeros (__m128i mask, __m128i a)
int _mm_test_mix_ones_zeros (__m128i mask, __m128i a)
int _mm_testc_si128 (__m128i a, __m128i b)
int _mm_testnzc_si128 (__m128i a, __m128i b)
int _mm_testz_si128 (__m128i a, __m128i b)
__m128d _mm_xor_pd (__m128d a, __m128d b)
__m128 _mm_xor_ps (__m128 a, __m128 b)
__m128i _mm_xor_si128 (__m128i a, __m128i b)

Set设置

__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_set_pd (double e1, double e0)
__m128d _mm_set_pd1 (double a)
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
__m128 _mm_set_ps1 (float a)
__m128d _mm_set_sd (double a)
__m128 _mm_set_ss (float a)
__m128i _mm_set1_epi16 (short a)
__m128i _mm_set1_epi32 (int a)
__m128i _mm_set1_epi64 (__m64 a)
__m128i _mm_set1_epi64x (__int64 a)
__m128i _mm_set1_epi8 (char a)
__m128d _mm_set1_pd (double a)
__m128 _mm_set1_ps (float a)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_setr_pd (double e1, double e0)
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
__m128d _mm_setzero_pd (void)
__m128 _mm_setzero_ps (void)
__m128i _mm_setzero_si128 ()

3、Store存储

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
void _mm_store_pd (double* mem_addr, __m128d a)
void _mm_store_pd1 (double* mem_addr, __m128d a)
void _mm_store_ps (float* mem_addr, __m128 a)
void _mm_store_ps1 (float* mem_addr, __m128 a)
void _mm_store_sd (double* mem_addr, __m128d a)
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
void _mm_store_ss (float* mem_addr, __m128 a)
void _mm_store1_pd (double* mem_addr, __m128d a)
void _mm_store1_ps (float* mem_addr, __m128 a)
void _mm_storeh_pd (double* mem_addr, __m128d a)
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
void _mm_storel_pd (double* mem_addr, __m128d a)
void _mm_storel_pi (__m64* mem_addr, __m128 a)
void _mm_storer_pd (double* mem_addr, __m128d a)
void _mm_storer_ps (float* mem_addr, __m128 a)
void _mm_storeu_pd (double* mem_addr, __m128d a)
void _mm_storeu_ps (float* mem_addr, __m128 a)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
void _mm_storeu_si16 (void* mem_addr, __m128i a)
void _mm_storeu_si32 (void* mem_addr, __m128i a)
void _mm_storeu_si64 (void* mem_addr, __m128i a)
void _mm_stream_pd (double* mem_addr, __m128d a)
void _mm_stream_pi (__m64* mem_addr, __m64 a)
void _mm_stream_ps (float* mem_addr, __m128 a)
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
void _mm_stream_si32 (int* mem_addr, int a)
void _mm_stream_si64 (__int64* mem_addr, __int64 a)

参考

1、https://www.zhihu.com/column/c_1550937293912748032
2、https://zhuanlan.zhihu.com/p/409973153
3、https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=4880,3865,6557&techs=SSE_ALL

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/942175.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

外部库/lib/maven依赖项 三者关系

外部库(存放项目初始配置的jar包)(它的文件夹里并没有包含lib文件夹的引的外部的依赖的jar包) lib(存放外部导入到项目的依赖的jar包) maven依赖项(管理项目所有的jar包依赖) 三者存放jar包的关系 项目所依赖的全部的jar包 maven依赖项的jar包 外部库中的jar包 lib中的…

基于STM32的酒精浓度检测报警防酒驾仿真设计(仿真+程序+讲解视频)

基于STM32的酒精浓度检测报警防酒驾仿真设计 讲解视频1.主要功能2.仿真3. 程序4. 资料清单&下载链接 基于STM32的酒精浓度检测报警防酒驾仿真设计(仿真程序讲解） 仿真图proteus 8.9 程序编译器：keil 5 编程语言：C语言 设计编号…

Oracle创建控制列表ACL(Access Control List)

Oracle创建控制列表ACL（Access Control List） Oracle ACL简介一、先登陆163邮箱设置开启SMTP。二、Oracle ACL控制列表处理（一）创建ACL（create_acl）（二）添加ACL权限（add_…

2023热门短剧小剧场APP小程序系统介绍

迈特的短剧saas项目买来能干什么 系统上线推广已经半个月，很多朋友还不懂这项目是干什么的，我来给大家讲一下我所见识的（非专业见解，说错了见谅） 玩法由来 这种热门短剧玩法在去年就已经出现了，但是今年20…

13.毛玻璃动画特效

效果 源码 <!DOCTYPE html> <html lang="en"> <head><meta charset="UTF-8"><title>Glassmorphism Animation Effects</title><link rel="stylesheet" href="style.css"> </head> &…

最新AI创作系统ChatGPT源码+详细图文部署教程/支持GPT-4/AI绘画/H5端/Prompt知识库/思维导图生成

一、AI系统 如何搭建部署AI创作ChatGPT系统呢？小编这里写一个详细图文教程吧！SparkAi使用Nestjs和Vue3框架技术，持续集成AI能力到AIGC系统！ 1.1 程序核心功能 程序已支持ChatGPT3.5/GPT-4提问、AI绘画、Midjourney绘画…

sql:SQL优化知识点记录(五)

（1）explain之例子 （2）索引单表优化案例 上面的功能已经实现，但是分析功能， 使用explain分析这条sql： 发现type为All Extra：有Using filesort （文件内排序） 这…

Kubernetes(k8s)当中安装并使用ingress暴露应用

Kubernetes当中安装并使用ingress暴露应用 为什么需要Ingress前期准备集群准备LoadBalancer准备 安装Ingress-Nginx下载地址v1.3.1v1.8.1 修改文件v1.3.1v1.8.1修改ingress服务类型配置 执行安装 部署应用通过ingress-nginx暴露应用部署ingress的yaml文件v1.3.1v1.8.1 为什么需…

E9—TEMAC IP实现千兆网口UDP传输2023-08-28

1.关于IP收费的问题 Tri Mode Ethernet MAC是收费IP，打开IP后，当左下角显示Bought IP license available则IP可用。 2.功能说明 应用搭建的场景是，上位机发送数据，首先发起arp请求，随后下位机给出arp应答响应…

设计模式—职责链模式(Chain of Responsibility)

目录 思维导图 什么是职责链模式？ 有什么优点呢？ 有什么缺点呢？ 什么场景使用呢？ 代码展示 ①、职责链模式 ②、加薪代码重构 思维导图 什么是职责链模式？ 使多个对象都有机会处理请求，从而避免请…

【计算机网络】OSI 七层网络参考模型

OSI（Open Systems Interconnection）七层网络参考模型是一种用于描述计算机网络通信的框架，将网络通信划分为七个不同的层次，每个层次负责不同的功能。 以下为 OSI 七层网络参考模型的简单表格： --------------------…

Java注解—Annotation

Java注解——Annotation 一、概念 注解也是Java中一种比较特殊的存在，一般可以声明在任何一个位置，用于给我们的代码提供一些说明或者提供一些功能。 Override Deprecated 二、注解使用一般分为三种情况的注解 1、只是用来进行解释说明的注解…

使用 Amazon Lambda 进行无服务器计算:云架构中的一场革命

引言 十年前,无服务器架构还像是痴人说梦。不再如此了! 有了 Amazon Lambda,我们现在可以建构和运行应用程序而不需要考虑服务器。云供应商会无缝地处理所有服务器的供应、扩展和管理。我们只需要关注代码。 这为云部署带来了前所未有的敏捷性、自动化和优化。但是,要发挥它的…

如何解决索引分裂问题?

索引分裂 索引块快写满时就会发生索引分裂，索引分裂分为两种情况，55和91： 索引分裂和enq: TX - index contention等待事件的区别 无论是55还是91，都是数据增多后索引的正常行为，索引分裂是业务数据量增大导致索引增大…

机器学习-神经网络(西瓜书)

神经网络 5.1 神经元模型 在生物神经网络中，神经元之间相互连接，当一个神经元受到的外界刺激足够大时，就会产生兴奋（称为"激活"），并将剩余的"刺激"向相邻的神经元传导。 神经元模型…

CentOS7 Docker-compose安装RabbitMQ

RabbitMQ是一个消息中间件，是用Erlang语言编写的。RabbitMQ据说具有良好的性能和时效性，同时还能够非常好的支持集群和负载部署，非常适合在较大规模的分布式系统中使用。接下来我们就以docker形式安装。 1.先安装docker环境 yum -y install…

MATLAB算法实战应用案例精讲-【自然语言处理】语义分割模型-DeepLabV3

目录 1、DeepLab系列简介 1.1.DeepLabV1 1.1.1创新点： 1.1.2. 动机： 1.1.3. 应对策略： 1.2.DeepLabV2 1.2.1.创新点： 1.2.2.动机 1.2.3. 应对策略： 1.3.DeepLabV3 1.3.1创新点： 1.3.2. 动机…

【大魔王送书第一期】《一名阿里服务端开发工程师的进阶之路》

一、前言 目前，资讯、社交、游戏、消费、出行等丰富多彩的互联网应用已经渗透到了人们生活和工作的方方面面，正深刻改变着信息时代。随着用户规模的增长和应用复杂度的上升，服务端面临的技术挑战越来越严峻。在头部互联网企业，服…

Docker:Harbor 私有仓库迁移

Harbor 私有仓库迁移 一.私有仓库迁移的介绍 1.为何要对Harbor 私有仓库的迁移 （1）硬件升级或更换：如果源 Harbor 在旧的硬件设备上运行，并且计划将其迁移到新的硬件设备上，那么需要执行迁移操作。 （2…

17万字集团大数据平台整体方案word

导读：原文《17万字集团大数据平台整体方案word》（获取来源见文尾），本文精选其中精华及架构部分，逻辑清晰、内容完整，为快速形成售前方案提供参考。以下是部分内容， 1.1.1 总体目标 根据集团信…