PIPS
neon.h
#include <arm_neon.h>


/* Uses 128-bit NEON instructions.
Notes:
 * NEON can also operate on 64-bit vectors.
 * NEON does not operate on double-precision floats. However, VFP can work on double-precision 64-bit vectors,
 but VFP is not a SIMD unit: it processes vectors scalar by scalar.
 * __reminder__: NEON data types: signed/unsigned 8-bit, 16-bit, 32-bit, 64-bit, single-precision floating point
 * TODO: alignment: [1] says that each instruction takes an alignment offset argument, but I can't find it in
 the intrinsics... Another option is to use an instruction that sets the alignment offset before each call,
 but that sounds like wasted cycles...
 * TODO: a feature of NEON is to load/store up to 4 vectors with just one instruction
 (see the vst{1,2,3,4}/vld{1,2,3,4} variants and [2] for examples; a sketch follows this comment block). This needs (I think) a modification in SAC.
 * [2] is a nice summary of the intrinsics used here

 [1] : http://infocenter.arm.com/help/topic/com.arm.doc.dui0489b/CIHGIAEH.html
 [2] : http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
 */
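
/* Hedged illustration of the vld{1,2,3,4}/vst{1,2,3,4} TODO above; this macro
   is not part of the original file and its name is made up. vld2q_f32 reads 8
   consecutive floats and de-interleaves them into two 128-bit vectors in a
   single instruction, e.g. splitting an interleaved re/im stream. */
#define SIMD_LOAD2_V4SF(vre,vim,arr)\
 do {\
 float32x4x2_t __pips_ld2 = vld2q_f32(arr);\
 vre = __pips_ld2.val[0];\
 vim = __pips_ld2.val[1];\
 } while (0)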

typedef float32_t a4sf[8] __attribute__ ((aligned (32)));
typedef int64_t a2di[4] __attribute__ ((aligned (32)));
typedef int32_t a4si[8] __attribute__ ((aligned (32)));
typedef int16_t a8hi[16] __attribute__ ((aligned (32)));
typedef int8_t a16qi[32] __attribute__ ((aligned (32)));

typedef float32x4_t v4sf;
typedef int64x2_t v2di;
typedef int32x4_t v4si;
typedef int16x8_t v8hi;
typedef int8x16_t v16qi;
/* float */
#define SIMD_LOAD_V4SF(vec,arr) vec=vld1q_f32(arr)
#define SIMD_LOADA_V4SF(vec,arr) vec=vld1q_f32(arr)
#define SIMD_MULPS(vec1,vec2,vec3) vec1=vmulq_f32(vec2,vec3)
#define SIMD_DIVPS(vec1,vec2,vec3)\
 do {\
 vec3=vrecpeq_f32(vec3);\
 vec1=vmulq_f32(vec2,vec3);\
 }\
 while (0)
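
/* Hedged sketch, not in the original file: SIMD_DIVPS above keeps only the raw
   vrecpeq_f32 estimate (roughly 8 bits of precision) and clobbers vec3. The
   hypothetical variant below refines the reciprocal with two vrecpsq_f32
   Newton-Raphson steps and leaves its inputs unchanged. */
#define SIMD_DIVPS_NR(vec1,vec2,vec3)\
 do {\
 float32x4_t __pips_r = vrecpeq_f32(vec3);\
 __pips_r = vmulq_f32(vrecpsq_f32(vec3,__pips_r),__pips_r);\
 __pips_r = vmulq_f32(vrecpsq_f32(vec3,__pips_r),__pips_r);\
 vec1 = vmulq_f32(vec2,__pips_r);\
 } while (0)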
#define SIMD_ADDPS(vec1,vec2,vec3) vec1=vaddq_f32(vec2,vec3)
#define SIMD_SUBPS(vec1, vec2, vec3) vec1=vsubq_f32(vec2, vec3)
#define SIMD_MULADDPS(vec1, vec2, vec3, vec4) vec1=vmlaq_f32(vec2,vec3,vec4)
#define SIMD_UMINPS(vec1, vec2) vec1=vnegq_f32(vec2)
#define SIMD_STORE_V4SF(vec,arr) vst1q_f32(arr,vec)
#define SIMD_STOREA_V4SF(vec,arr) vst1q_f32(arr,vec)
#define SIMD_STORE_GENERIC_V4SF(vec,v0,v1,v2,v3) \
 do { \
 float __pips_tmp[4] __attribute__ ((aligned (16))); \
 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
 *(v0)=__pips_tmp[0]; \
 *(v1)=__pips_tmp[1]; \
 *(v2)=__pips_tmp[2]; \
 *(v3)=__pips_tmp[3]; \
 } while (0)
#define SIMD_ZERO_V4SF(vec) vec = vdupq_n_f32(0.0f) /* vsubq_f32(vec,vec) would yield NaN for NaN/Inf lanes */

#define SIMD_LOAD_GENERIC_V4SF(vec,v0,v1,v2,v3) \
 do { \
 float __pips_v[4] __attribute__ ((aligned (16)));\
 __pips_v[0]=v0;\
 __pips_v[1]=v1;\
 __pips_v[2]=v2;\
 __pips_v[3]=v3;\
 SIMD_LOADA_V4SF(vec,&__pips_v[0]); \
 } while(0)
/* handle padded values; this is a very bad implementation ... */
#define SIMD_STORE_MASKED_V4SF(vec,arr) \
 do { \
 float __pips_tmp[4] __attribute__ ((aligned (16))); \
 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
 (arr)[0] = __pips_tmp[0]; \
 (arr)[1] = __pips_tmp[1]; \
 (arr)[2] = __pips_tmp[2]; \
 } while(0)

#define SIMD_LOAD_V4SI_TO_V4SF(v, f) \
 do { \
 float __pips_tmp[4]; \
 __pips_tmp[0] = (f)[0]; \
 __pips_tmp[1] = (f)[1]; \
 __pips_tmp[2] = (f)[2]; \
 __pips_tmp[3] = (f)[3]; \
 SIMD_LOAD_V4SF(v, __pips_tmp); \
 } while(0)
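
/* Hedged usage sketch, not in the original file: an axpy-style loop built only
   from the float macros above. The function name and the assumption that n is
   a multiple of 4 are illustrative. */
static void pips_neon_axpy_example(float32_t *y, const float32_t *x,
                                   float32_t a, int n)
{
    v4sf va, vx, vy;
    int i;
    SIMD_LOAD_GENERIC_V4SF(va, a, a, a, a); /* broadcast the scalar */
    for (i = 0; i < n; i += 4) {
        SIMD_LOAD_V4SF(vx, x + i);
        SIMD_LOAD_V4SF(vy, y + i);
        SIMD_MULADDPS(vy, vy, va, vx); /* vy = vy + va * vx */
        SIMD_STORE_V4SF(vy, y + i);
    }
}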
/* int64_t */
#define SIMD_LOAD_V2DI(vec,arr) vec=vld1q_s64(arr)
#define SIMD_STORE_V2DI(vec,arr) vst1q_s64(arr,vec)

#define SIMD_ZERO_V2DI(vec) vec = veorq_s64(vec,vec)

#define SIMD_ADDDI(v1,v2,v3) v1=vaddq_s64(v2,v3)
#define SIMD_SUBDI(v1,v2,v3) v1=vsubq_s64(v2,v3)
#define SIMD_DIVDI(vec1,vec2,vec3)\
 do {\
 vec3=vrecpeq_s64(vec3);\
 vec1=vmulq_s64(vec2,vec3);\
 }\
 while (0)
#define SIMD_MULDI(v1,v2,v3) v1=vmulq_s64(v2,v3)
#define SIMD_MULADDDI(vec1, vec2, vec3, vec4) vec1=vmlaq_s64(vec2,vec3,vec4)
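
/* Hedged sketch, not in the original file: arm_neon.h provides no vmulq_s64,
   vmlaq_s64 or vrecpeq_s64, so SIMD_DIVDI, SIMD_MULDI and SIMD_MULADDDI above
   will not compile as written. One fallback is to work lane by lane, as in the
   hypothetical macro below; the multiply-add and division cases follow the
   same pattern. */
#define SIMD_MULDI_SCALAR(v1,v2,v3)\
 do {\
 int64_t __pips_p[2];\
 __pips_p[0] = vgetq_lane_s64(v2,0) * vgetq_lane_s64(v3,0);\
 __pips_p[1] = vgetq_lane_s64(v2,1) * vgetq_lane_s64(v3,1);\
 v1 = vld1q_s64(__pips_p);\
 } while (0)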
/* int32_t */
#define SIMD_LOAD_V4SI(vec,arr) vec=vld1q_s32(arr)
#define SIMD_STORE_V4SI(vec,arr) vst1q_s32(arr,vec)

#define SIMD_ZERO_V4SI(vec) vec = veorq_s32(vec,vec)

#define SIMD_ADDD(v1,v2,v3) v1=vaddq_s32(v2,v3)
#define SIMD_SUBD(v1,v2,v3) v1=vsubq_s32(v2,v3)
#define SIMD_DIVD(vec1,vec2,vec3)\
 do {\
 vec3=vrecpeq_s32(vec3);\
 vec1=vmulq_s32(vec2,vec3);\
 }\
 while (0)
#define SIMD_MULD(v1,v2,v3) v1=vmulq_s32(v2,v3)
#define SIMD_MULADDD(vec1, vec2, vec3, vec4) vec1=vmlaq_s32(vec2,vec3,vec4)
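
/* Hedged sketch, not in the original file: there is no vrecpeq_s32 in
   arm_neon.h, so SIMD_DIVD above will not compile as written. The hypothetical
   macro below keeps the reciprocal-estimate approach by converting to float,
   refining once with vrecpsq_f32, and truncating the quotient back to int32
   (the result is only approximate for large operands). */
#define SIMD_DIVD_VIA_F32(vec1,vec2,vec3)\
 do {\
 float32x4_t __pips_fd = vcvtq_f32_s32(vec3);\
 float32x4_t __pips_fr = vrecpeq_f32(__pips_fd);\
 __pips_fr = vmulq_f32(vrecpsq_f32(__pips_fd,__pips_fr),__pips_fr);\
 vec1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(vec2),__pips_fr));\
 } while (0)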
/* int16_t */
#define SIMD_LOAD_V8HI(vec,arr) vec=vld1q_s16(arr)
#define SIMD_STORE_V8HI(vec,arr) vst1q_s16(arr,vec)

#define SIMD_ZERO_V8HI(vec) vec = veorq_s16(vec,vec)

#define SIMD_ADDHI(v1,v2,v3) v1=vaddq_s16(v2,v3)
#define SIMD_SUBHI(v1,v2,v3) v1=vsubq_s16(v2,v3)
#define SIMD_DIVHI(vec1,vec2,vec3)\
 do {\
 vec3=vrecpeq_s16(vec3);\
 vec1=vmulq_s16(vec2,vec3);\
 }\
 while (0)
#define SIMD_MULHI(v1,v2,v3) v1=vmulq_s16(v2,v3)

#define SIMD_STORE_V8HI_TO_V8SI(vec,arr)\
 SIMD_STORE_V8HI(vec,arr)
#define SIMD_LOAD_V8SI_TO_V8HI(vec,arr)\
 SIMD_LOAD_V8HI(vec,arr)

#define SIMD_MULADDHI(vec1, vec2, vec3, vec4) vec1=vmlaq_s16(vec2,vec3,vec4)
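
/* Hedged sketch, not in the original file: SIMD_STORE_V8HI_TO_V8SI above is a
   plain alias, so it writes int16_t lanes into what its name suggests is an
   int32_t array, and SIMD_DIVHI relies on a vrecpeq_s16 intrinsic that
   arm_neon.h does not provide. A widening store could instead sign-extend the
   eight lanes with vmovl_s16, as in the hypothetical macro below (the reverse
   direction would narrow with vmovn_s32 and vcombine_s16). */
#define SIMD_STORE_V8HI_TO_V8SI_WIDEN(vec,arr)\
 do {\
 vst1q_s32((arr),vmovl_s16(vget_low_s16(vec)));\
 vst1q_s32((arr)+4,vmovl_s16(vget_high_s16(vec)));\
 } while (0)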
/* int8_t */
#define SIMD_LOAD_V16QI(vec,arr) vec=vld1q_s8(arr)
#define SIMD_STORE_V16QI(vec,arr) vst1q_s8(arr,vec)

#define SIMD_ZERO_V16QI(vec) vec = veorq_s8(vec,vec)

#define SIMD_ADDQI(v1,v2,v3) v1=vaddq_s8(v2,v3)
#define SIMD_SUBQI(v1,v2,v3) v1=vsubq_s8(v2,v3)
#define SIMD_DIVQI(vec1,vec2,vec3)\
 do {\
 vec3=vrecpeq_s8(vec3);\
 vec1=vmulq_s8(vec2,vec3);\
 }\
 while (0)
#define SIMD_MULQI(v1,v2,v3) v1=vmulq_s8(v2,v3)

#define SIMD_MULADDQI(vec1, vec2, vec3, vec4) vec1=vmlaq_s8(vec2,vec3,vec4)
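
/* Hedged sketch, not in the original file: as with the other integer widths,
   arm_neon.h has no vrecpeq_s8, so SIMD_DIVQI above will not compile as
   written. The hypothetical macro below spills to arrays and divides lane by
   lane, which is slow but exact. */
#define SIMD_DIVQI_SCALAR(vec1,vec2,vec3)\
 do {\
 int8_t __pips_n[16], __pips_d[16];\
 int __pips_i;\
 vst1q_s8(__pips_n,vec2);\
 vst1q_s8(__pips_d,vec3);\
 for (__pips_i = 0; __pips_i < 16; ++__pips_i)\
 __pips_n[__pips_i] = (int8_t)(__pips_n[__pips_i] / __pips_d[__pips_i]);\
 vec1 = vld1q_s8(__pips_n);\
 } while (0)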