- __m128i
/*
 * Integer SIMD vector type, declared as a union so the same storage can be
 * viewed as lanes of different widths. Every member array has the same total
 * size (16 x 8, 8 x 16, 4 x 32 or 2 x 64 bits), assuming the sized __intN
 * types have the widths their names state.
 * _CRT_ALIGN(16) requests 16-byte alignment for the type.
 * NOTE(review): __declspec(intrin_type) is compiler-specific; presumably it
 * marks the union as a built-in intrinsic type -- confirm against the
 * compiler documentation.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
__int8 m128i_i8[16];             /* 16 lanes of __int8 */
__int16 m128i_i16[8];            /* 8 lanes of __int16 */
__int32 m128i_i32[4];            /* 4 lanes of __int32 */
__int64 m128i_i64[2];            /* 2 lanes of __int64 */
unsigned __int8 m128i_u8[16];    /* 16 lanes of unsigned __int8 */
unsigned __int16 m128i_u16[8];   /* 8 lanes of unsigned __int16 */
unsigned __int32 m128i_u32[4];   /* 4 lanes of unsigned __int32 */
unsigned __int64 m128i_u64[2];   /* 2 lanes of unsigned __int64 */
} __m128i;
/*
 * Provide _CRT_ALIGN(x) only when nothing earlier has defined it.
 * This is the macro used as _CRT_ALIGN(16) on the __m128i typedef.
 */
#if !defined(_CRT_ALIGN)
#if defined(__midl)
/* With __midl defined, the macro expands to nothing (no alignment specifier). */
#define _CRT_ALIGN(x)
#else
/* Otherwise it expands to the __declspec(align(x)) alignment specifier. */
#define _CRT_ALIGN(x) __declspec(align(x))
#endif
#endif
- SSE2
- __m128i _mm_unpackhi_epi16 (__m128i a, __m128i b)
- __m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)
- __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b)
- __m128i _mm_unpackhi_epi8 (__m128i a, __m128i b)
- __m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)
- __m128i _mm_unpacklo_epi32 (__m128i a, __m128i b)
- __m128i _mm_unpacklo_epi64 (__m128i a, __m128i b)
- __m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
- void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
- __m128i _mm_xor_si128 (__m128i a, __m128i b)
- __m128i _mm_or_si128 (__m128i a, __m128i b)
- __m128i _mm_and_si128 (__m128i a, __m128i b)
- __m128i _mm_slli_epi16 (__m128i a, int imm8)
- __m128i _mm_srli_epi16 (__m128i a, int imm8)
- __m128i _mm_add_epi16 (__m128i a, __m128i b)
- __m128i _mm_adds_epi16 (__m128i a, __m128i b)
- 两个加法的不同在于,add会舍弃进位,adds会在溢出时将结果设置为最大值或最小值
#include <stdio.h>
#include <emmintrin.h>
/*
 * Minimal SSE2 example: build one 128-bit integer vector from a repeated
 * byte value. The vector is never read afterwards, so the program prints
 * nothing and always exits with status 0.
 */
int main(void)
{
    /* Broadcast the byte 0x11 into the vector via _mm_set1_epi8. */
    __m128i filled = _mm_set1_epi8(0x11);

    return 0;
}
Compile with
$ gcc -march=native test.c
- vld4_u16
uint16x4x4_t vld4_u16 (const uint16_t *)
Form of expected instruction(s): vld4.16 {d0, d1, d2, d3}, [r0]
- vld4q_u16
uint16x8x4_t vld4q_u16 (const uint16_t *)
Form of expected instruction(s): vld4.16 {d0, d1, d2, d3}, [r0]
- vld4_lane_u16
uint16x4x4_t vld4_lane_u16 (const uint16_t *, uint16x4x4_t, const int)
Form of expected instruction(s): vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
- 从一个复数点积算法看NEON的汇编优化向量乘法加法示例
- AARCH64 ARMV8 NEON的变动
- v8-v15在子函数调用时必须要保留,但是只保留低64bits
- Procedure Call Standard for the ARM 64-bit Architecture
- Getting to know ARM64 NEON一个向量加法示例
- neon-simple-guide
- NEON and VFP编程指令含义中文介绍, page 323
- Cortex-A9 NEON Media Processing Engine Revision: r4p1指令时间说明
- ARM中文社区
- ARM NEON optimization, ARM社区
- 示例:Remove data dependencies
- 示例:Reduce branches
- NEON assembly and intrinsic优劣势比较
- ARM NEON programming quick reference, ARM社区
- ARMv7-A, ARMv8-A AArch32, ARMv8-A AArch64寄存器、指令结构
- NEON汇编示例:Assembly files or Inline assembly
- Coding for NEON - Part 1: load and stores
- Coding for NEON - Part 2: Dealing With Leftovers
- Coding for NEON - Part 4: Shifting Left and Right
- Coding for NEON - Part 5: Rearranging Vectors
- ARM NEON编程初探——一个简单的BGR888转YUV444实例详解
- ARM NEON Programmer's Reading Guide
- ARM NEON tips
- An Introduction to ARM NEON
- ARM NEON support in the ARM compiler White Paper
- ARM Compiler Version 5.06 armasm user guide指令描述更加详细,有图示
- objdump is a program for displaying various information about object files on Unix-like systems
- How to check the existence of NEON on arm?
- ARMv8 Instruction Set Overview
- ARM Assembly Local Labels
- Useful assembler directives and macros for the GNU assembler
- A64 Shift and Extend Operations: Operand Modifiers
- (vip)Accelerated AES for the ARM64 Linux kernel, AES-NEON性能评估,代码作者撰写
- makefiles CFLAGS -D, 编译宏定义