PIPS
avx.h
#include <immintrin.h>
#include <math.h> /* cos() and sin() are used by SIMD_COSPD / SIMD_SINPD below */

typedef double a4df[4] __attribute__ ((aligned (32)));
typedef float a8sf[8] __attribute__ ((aligned (32)));
typedef float a4sf[4] __attribute__ ((aligned (32)));
typedef long long a4di[4] __attribute__ ((aligned (32)));
typedef int a8si[8] __attribute__ ((aligned (32)));
typedef short a16hi[16] __attribute__ ((aligned (32)));
typedef char a32qi[32] __attribute__ ((aligned (32)));

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m128 v4sf;
typedef __m256i v4di;
typedef __m256i v8si;
typedef __m256i v16hi;
typedef __m256i v32qi;
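
/* Naming convention: v4df = vector of 4 doubles, v8sf = 8 single floats,
 * v8si = 8 ints, v16hi = 16 shorts, v32qi = 32 chars (the GCC machine-mode
 * names). All integer widths share the __m256i register type; the aNxx
 * typedefs above are the matching 32-byte-aligned array types. */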

/* float */
#define SIMD_LOAD_V8SF(vec,arr) vec=_mm256_loadu_ps(arr)
/* the broadcast macros take a scalar value, not an array */
#define SIMD_LOAD_BROADCAST_V8SF(vec,val) vec=_mm256_set1_ps(val)
#define SIMD_LOAD_BROADCAST_V4DF(vec,val) vec=_mm256_set1_pd(val)
#define SIMD_LOADA_V8SF(vec,arr) vec=_mm256_load_ps(arr)
#define SIMD_MULPS(vec1,vec2,vec3) vec1=_mm256_mul_ps(vec2,vec3)
#define SIMD_DIVPS(vec1,vec2,vec3) vec1=_mm256_div_ps(vec2,vec3)
#define SIMD_ADDPS(vec1,vec2,vec3) vec1=_mm256_add_ps(vec2,vec3)
#define SIMD_SUBPS(vec1,vec2,vec3) vec1=_mm256_sub_ps(vec2,vec3)
#define SIMD_MULADDPS(vec1,vec2,vec3,vec4) \
  do { \
    __m256 __pips_tmp; \
    SIMD_MULPS(__pips_tmp,vec3,vec4); \
    SIMD_ADDPS(vec1,__pips_tmp,vec2); \
  } while(0)
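
/* Note: SIMD_MULADDPS computes vec1 = vec2 + vec3*vec4 with two separately
 * rounded operations. On an FMA-capable target the single-rounding intrinsic
 * _mm256_fmadd_ps(vec3,vec4,vec2) could be substituted (requires -mfma);
 * results may then differ in the last bit. */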

/* single-float shuffles; _mm256_shuffle_ps applies the same 4-element
   pattern within each 128-bit lane */
#define SIMD_SHUFFLE_V8SF(dest,src,i0,i1,i2,i3) dest=_mm256_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))
#define SIMD_SHUFFLE_V4SF(dest,src,i0,i1,i2,i3) dest=_mm_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))

/* umin as in unary minus */
#define SIMD_UMINPS(vec1,vec2) \
  do { \
    __m256 __pips_tmp; \
    __pips_tmp = _mm256_setzero_ps(); \
    vec1 = _mm256_sub_ps(__pips_tmp,vec2); \
  } while(0)

#define SIMD_STORE_V8SF(vec,arr) _mm256_storeu_ps(arr,vec)
#define SIMD_STOREA_V8SF(vec,arr) _mm256_store_ps(arr,vec)
#define SIMD_STORE_GENERIC_V8SF(vec,v0,v1,v2,v3,v4,v5,v6,v7) \
  do { \
    float __pips_tmp[8] __attribute__ ((aligned (32))); /* must hold all 8 lanes */ \
    SIMD_STOREA_V8SF(vec,&__pips_tmp[0]); \
    *(v0)=__pips_tmp[0]; \
    *(v1)=__pips_tmp[1]; \
    *(v2)=__pips_tmp[2]; \
    *(v3)=__pips_tmp[3]; \
    *(v4)=__pips_tmp[4]; \
    *(v5)=__pips_tmp[5]; \
    *(v6)=__pips_tmp[6]; \
    *(v7)=__pips_tmp[7]; \
  } while(0)

#define SIMD_ZERO_V8SF(vec) vec = _mm256_setzero_ps()
/* element order matches SIMD_STORE_GENERIC_V8SF: v0 ends up in lane 0 */
#define SIMD_LOAD_GENERIC_V8SF(vec,v0,v1,v2,v3,v4,v5,v6,v7) \
  do { \
    vec=_mm256_setr_ps(v0,v1,v2,v3,v4,v5,v6,v7); \
  } while(0)

#define SIMD_LOAD_V8SI_TO_V8SF(v,f) \
  do { \
    float __pips_tmp[8]; \
    __pips_tmp[0] = (f)[0]; \
    __pips_tmp[1] = (f)[1]; \
    __pips_tmp[2] = (f)[2]; \
    __pips_tmp[3] = (f)[3]; \
    __pips_tmp[4] = (f)[4]; \
    __pips_tmp[5] = (f)[5]; \
    __pips_tmp[6] = (f)[6]; \
    __pips_tmp[7] = (f)[7]; \
    SIMD_LOAD_V8SF(v,__pips_tmp); \
  } while(0)
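
/* Illustrative usage sketch (not part of the original header; the function
 * name is hypothetical): a saxpy-style kernel built from the float macros
 * above. Assumes n is a multiple of 8; the unaligned load/store variants
 * impose no alignment requirement on x or y. */
static inline void pips_example_saxpy(float *y, const float *x, float a, int n)
{
    v8sf va, vx, vy;
    SIMD_LOAD_BROADCAST_V8SF(va, a);   /* va = {a,...,a} */
    for (int i = 0; i < n; i += 8) {
        SIMD_LOAD_V8SF(vx, x + i);     /* vx = x[i..i+7] */
        SIMD_LOAD_V8SF(vy, y + i);     /* vy = y[i..i+7] */
        SIMD_MULADDPS(vy, vy, va, vx); /* vy = vy + a * vx */
        SIMD_STORE_V8SF(vy, y + i);    /* y[i..i+7] = vy */
    }
}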

/* double */
#define SIMD_LOAD_V4DF(vec,arr) vec=_mm256_loadu_pd(arr)
#define SIMD_MULPD(vec1,vec2,vec3) vec1=_mm256_mul_pd(vec2,vec3)
#define SIMD_ADDPD(vec1,vec2,vec3) vec1=_mm256_add_pd(vec2,vec3)
#define SIMD_MULADDPD(vec1,vec2,vec3,vec4) \
  do { \
    __m256d __pips_tmp; \
    SIMD_MULPD(__pips_tmp,vec3,vec4); \
    SIMD_ADDPD(vec1,__pips_tmp,vec2); \
  } while(0)
#define SIMD_UMINPD(vec1,vec2) \
  do { \
    __m256d __pips_tmp; \
    __pips_tmp = _mm256_setzero_pd(); \
    vec1 = _mm256_sub_pd(__pips_tmp,vec2); \
  } while(0)

/* There is no AVX sine/cosine instruction; these spill to memory and call
   the scalar libm routines lane by lane. */
#define SIMD_COSPD(vec1,vec2) \
  do { \
    double __pips_tmp[4] __attribute__ ((aligned (32))); \
    SIMD_STORE_V4DF(vec2,__pips_tmp); \
    __pips_tmp[0] = cos(__pips_tmp[0]); \
    __pips_tmp[1] = cos(__pips_tmp[1]); \
    __pips_tmp[2] = cos(__pips_tmp[2]); \
    __pips_tmp[3] = cos(__pips_tmp[3]); \
    SIMD_LOAD_V4DF(vec1,__pips_tmp); \
  } while(0)

#define SIMD_SINPD(vec1,vec2) \
  do { \
    double __pips_tmp[4] __attribute__ ((aligned (32))); \
    SIMD_STORE_V4DF(vec2,__pips_tmp); \
    __pips_tmp[0] = sin(__pips_tmp[0]); \
    __pips_tmp[1] = sin(__pips_tmp[1]); \
    __pips_tmp[2] = sin(__pips_tmp[2]); \
    __pips_tmp[3] = sin(__pips_tmp[3]); \
    SIMD_LOAD_V4DF(vec1,__pips_tmp); \
  } while(0)

#define SIMD_STORE_V4DF(vec,arr) _mm256_storeu_pd(arr,vec)
#define SIMD_STORE_GENERIC_V4DF(vec,v0,v1,v2,v3) \
  do { \
    double __pips_tmp[4]; \
    SIMD_STORE_V4DF(vec,&__pips_tmp[0]); \
    *(v0)=__pips_tmp[0]; \
    *(v1)=__pips_tmp[1]; \
    *(v2)=__pips_tmp[2]; \
    *(v3)=__pips_tmp[3]; \
  } while(0)

/* element order matches SIMD_STORE_GENERIC_V4DF: v0 ends up in lane 0 */
#define SIMD_LOAD_GENERIC_V4DF(vec,v0,v1,v2,v3) \
  do { \
    vec=_mm256_setr_pd(v0,v1,v2,v3); \
  } while(0)
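
/* Illustrative usage sketch (not part of the original header; the function
 * name is hypothetical): filling an output array with sin(x[i]) four
 * doubles at a time through the scalar-fallback macro above. Assumes n is
 * a multiple of 4. */
static inline void pips_example_sin_array(double *out, const double *x, int n)
{
    v4df vx, vs;
    for (int i = 0; i < n; i += 4) {
        SIMD_LOAD_V4DF(vx, x + i);    /* vx = x[i..i+3] */
        SIMD_SINPD(vs, vx);           /* vs = sin(vx), element-wise */
        SIMD_STORE_V4DF(vs, out + i); /* out[i..i+3] = vs */
    }
}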

/* conversions */
#define SIMD_STORE_V4DF_TO_V4SF(vec,f) \
  do { \
    double __pips_tmp[4]; \
    SIMD_STORE_V4DF(vec,__pips_tmp); \
    (f)[0] = __pips_tmp[0]; \
    (f)[1] = __pips_tmp[1]; \
    (f)[2] = __pips_tmp[2]; \
    (f)[3] = __pips_tmp[3]; \
  } while(0)

#define SIMD_LOAD_V4SF_TO_V4DF(vec,f) \
  do { \
    __m128 __pips_vecsf = _mm_loadu_ps(f); /* unaligned, like the other SIMD_LOAD_* macros */ \
    vec = _mm256_cvtps_pd(__pips_vecsf); \
  } while(0)
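
/* Illustrative usage sketch (not part of the original header; the function
 * name is hypothetical): widening float data to double, adding in double
 * precision, then narrowing back. Assumes n is a multiple of 4. */
static inline void pips_example_add_as_double(float *acc, const float *x, int n)
{
    v4df vacc, vx;
    for (int i = 0; i < n; i += 4) {
        SIMD_LOAD_V4SF_TO_V4DF(vacc, acc + i);  /* widen acc[i..i+3] */
        SIMD_LOAD_V4SF_TO_V4DF(vx, x + i);      /* widen x[i..i+3] */
        SIMD_ADDPD(vacc, vacc, vx);             /* double-precision add */
        SIMD_STORE_V4DF_TO_V4SF(vacc, acc + i); /* narrow back to float */
    }
}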

/* long long */
/* the integer store intrinsics return void and take (pointer, vector);
   the casts let these macros accept plain integer pointers */
#define SIMD_LOADA_V4DI(vec,arr) \
  vec=_mm256_load_si256((__m256i const*)(arr))

#define SIMD_STOREA_V4DI(vec,arr) \
  _mm256_store_si256((__m256i*)(arr),vec)

#define SIMD_LOAD_V4DI(vec,arr) \
  vec=_mm256_loadu_si256((__m256i const*)(arr))

#define SIMD_STORE_V4DI(vec,arr) \
  _mm256_storeu_si256((__m256i*)(arr),vec)

/* int */
#define SIMD_LOADA_V8SI(vec,arr) \
  vec=_mm256_load_si256((__m256i const*)(arr))

#define SIMD_STOREA_V8SI(vec,arr) \
  _mm256_store_si256((__m256i*)(arr),vec)

#define SIMD_LOAD_V8SI(vec,arr) \
  vec=_mm256_loadu_si256((__m256i const*)(arr))

#define SIMD_STORE_V8SI(vec,arr) \
  _mm256_storeu_si256((__m256i*)(arr),vec)

/* short */
#define SIMD_LOADA_V16HI(vec,arr) \
  vec=_mm256_load_si256((__m256i const*)(arr))

#define SIMD_STOREA_V16HI(vec,arr) \
  _mm256_store_si256((__m256i*)(arr),vec)

#define SIMD_LOAD_V16HI(vec,arr) \
  vec=_mm256_loadu_si256((__m256i const*)(arr))

#define SIMD_STORE_V16HI(vec,arr) \
  _mm256_storeu_si256((__m256i*)(arr),vec)

/* char */
#define SIMD_LOADA_V32QI(vec,arr) \
  vec=_mm256_load_si256((__m256i const*)(arr))

#define SIMD_STOREA_V32QI(vec,arr) \
  _mm256_store_si256((__m256i*)(arr),vec)

#define SIMD_LOAD_V32QI(vec,arr) \
  vec=_mm256_loadu_si256((__m256i const*)(arr))

#define SIMD_STORE_V32QI(vec,arr) \
  _mm256_storeu_si256((__m256i*)(arr),vec)
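
/* Illustrative usage sketch (not part of the original header; the function
 * name is hypothetical): copying 8 ints at a time through a vector register
 * with the unaligned int macros. Assumes n is a multiple of 8. */
static inline void pips_example_copy_ints(int *dst, const int *src, int n)
{
    v8si v;
    for (int i = 0; i < n; i += 8) {
        SIMD_LOAD_V8SI(v, src + i);  /* v = src[i..i+7] */
        SIMD_STORE_V8SI(v, dst + i); /* dst[i..i+7] = v */
    }
}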