PIPS
sse.h
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h> /* SSE4.1, needed for _mm_mullo_epi32 below */
#include <math.h>      /* cos() and sin(), used by SIMD_COSPD / SIMD_SINPD */

typedef float a2sf[2] __attribute__ ((aligned (16)));
typedef float a4sf[4] __attribute__ ((aligned (16)));
typedef double a2df[2] __attribute__ ((aligned (16)));
typedef int a4si[4] __attribute__ ((aligned (16)));
typedef int a8si[8] __attribute__ ((aligned (16)));

typedef __m128 v4sf;
typedef __m128d v2df;
typedef __m128i v4si;
typedef __m128i v8si;
typedef __m128i v8hi;
/* int */
#define SIMD_LOAD_V4SI(vec,arr) vec=_mm_loadu_si128((__m128i*)arr)
#define SIMD_LOADA_V4SI(vec,arr) vec=_mm_load_si128((__m128i*)arr)
#define SIMD_LOAD_BROADCAST_V4SI(vec,val) vec=_mm_set1_epi32(val)
/* _mm_mullo_epi32 (SSE4.1) is the element-wise 32-bit multiply;
 * _mm_mul_epi32 would only multiply the even-indexed elements. */
#define SIMD_MULD(vec1,vec2,vec3) vec1=_mm_mullo_epi32(vec2,vec3)
#define SIMD_ADDD(vec1,vec2,vec3) vec1=_mm_add_epi32(vec2,vec3)
#define SIMD_SUBD(vec1,vec2,vec3) vec1=_mm_sub_epi32(vec2,vec3)

#define SIMD_STORE_V4SI(vec,arr) _mm_storeu_si128((__m128i*)arr,vec)
#define SIMD_STOREA_V4SI(vec,arr) _mm_store_si128((__m128i*)arr,vec)

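/* Usage sketch (illustrative, not part of the original header; the function
 * name is hypothetical): add two int arrays four lanes at a time, assuming
 * n is a multiple of 4. */
static inline void pips_example_add_v4si(int *dst, const int *a,
                                         const int *b, int n) {
    int i;
    for (i = 0; i < n; i += 4) {
        v4si va, vb, vr;
        SIMD_LOAD_V4SI(va, &a[i]);
        SIMD_LOAD_V4SI(vb, &b[i]);
        SIMD_ADDD(vr, va, vb);
        SIMD_STORE_V4SI(vr, &dst[i]);
    }
}
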
/* float */
#define SIMD_LOAD_V4SF(vec,arr) vec=_mm_loadu_ps(arr)
#define SIMD_LOADA_V4SF(vec,arr) vec=_mm_load_ps(arr)
#define SIMD_LOAD_BROADCAST_V4SF(vec,val) vec=_mm_set1_ps(val)
#define SIMD_MULPS(vec1,vec2,vec3) vec1=_mm_mul_ps(vec2,vec3)
#define SIMD_DIVPS(vec1,vec2,vec3) vec1=_mm_div_ps(vec2,vec3)
#define SIMD_ADDPS(vec1,vec2,vec3) vec1=_mm_add_ps(vec2,vec3)
#define SIMD_SUBPS(vec1,vec2,vec3) vec1=_mm_sub_ps(vec2,vec3)
#define SIMD_MULADDPS(vec1, vec2, vec3, vec4) \
 do { \
 __m128 __pips_tmp; \
 SIMD_MULPS(__pips_tmp, vec3, vec4); \
 SIMD_ADDPS(vec1, __pips_tmp, vec2); \
 } while(0)

#define SIMD_SHUFFLE_V4SF(dst,src,i0,i1,i2,i3) dst=_mm_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))

/* umin as in unary minus */
#define SIMD_UMINPS(vec1, vec2) \
 do { \
 __m128 __pips_tmp; \
 __pips_tmp = _mm_setzero_ps(); \
 vec1 = _mm_sub_ps(__pips_tmp, vec2); \
 } while(0)

#define SIMD_STORE_V4SF(vec,arr) _mm_storeu_ps(arr,vec)
#define SIMD_STOREA_V4SF(vec,arr) _mm_store_ps(arr,vec)
#define SIMD_STORE_GENERIC_V4SF(vec,v0,v1,v2,v3) \
 do { \
 float __pips_tmp[4] __attribute__ ((aligned (16))); \
 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
 *(v0)=__pips_tmp[0]; \
 *(v1)=__pips_tmp[1]; \
 *(v2)=__pips_tmp[2]; \
 *(v3)=__pips_tmp[3]; \
 } while (0)

#define SIMD_ZERO_V4SF(vec) vec = _mm_setzero_ps()
/* reverse the order of the four lanes: (a,b,c,d) -> (d,c,b,a);
 * shuffle indices must be in 0..3 */
#define SIMD_INVERT_V4SF(vec) vec = _mm_shuffle_ps(vec,vec,_MM_SHUFFLE(0,1,2,3))

#define SIMD_LOAD_GENERIC_V4SF(vec,v0,v1,v2,v3) \
 do { \
 float __pips_v[4] __attribute__ ((aligned (16))); \
 __pips_v[0]=v0; \
 __pips_v[1]=v1; \
 __pips_v[2]=v2; \
 __pips_v[3]=v3; \
 SIMD_LOADA_V4SF(vec,&__pips_v[0]); \
 } while(0)

/* handle padded values: only the first three lanes are stored.
 * This is a very naive implementation ... */
#define SIMD_STORE_MASKED_V4SF(vec,arr) \
 do { \
 float __pips_tmp[4] __attribute__ ((aligned (16))); \
 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
 (arr)[0] = __pips_tmp[0]; \
 (arr)[1] = __pips_tmp[1]; \
 (arr)[2] = __pips_tmp[2]; \
 } while(0)

/* convert four ints to four floats, element by element */
#define SIMD_LOAD_V4SI_TO_V4SF(v, f) \
 do { \
 float __pips_tmp[4]; \
 __pips_tmp[0] = (f)[0]; \
 __pips_tmp[1] = (f)[1]; \
 __pips_tmp[2] = (f)[2]; \
 __pips_tmp[3] = (f)[3]; \
 SIMD_LOAD_V4SF(v, __pips_tmp); \
 } while(0)

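/* Usage sketch (illustrative, not part of the original header; the function
 * name is hypothetical): dst[i] = a[i] + b[i]*c[i] four lanes at a time,
 * assuming n is a multiple of 4. */
static inline void pips_example_muladd_v4sf(float *dst, const float *a,
                                            const float *b, const float *c,
                                            int n) {
    int i;
    for (i = 0; i < n; i += 4) {
        v4sf va, vb, vc, vr;
        SIMD_LOAD_V4SF(va, &a[i]);
        SIMD_LOAD_V4SF(vb, &b[i]);
        SIMD_LOAD_V4SF(vc, &c[i]);
        SIMD_MULADDPS(vr, va, vb, vc); /* vr = va + vb*vc */
        SIMD_STORE_V4SF(vr, &dst[i]);
    }
}
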
/* double */
#define SIMD_LOAD_V2DF(vec,arr) vec=_mm_loadu_pd(arr)
#define SIMD_MULPD(vec1,vec2,vec3) vec1=_mm_mul_pd(vec2,vec3)
#define SIMD_ADDPD(vec1,vec2,vec3) vec1=_mm_add_pd(vec2,vec3)
#define SIMD_MULADDPD(vec1, vec2, vec3, vec4) \
 do { \
 __m128d __pips_tmp; \
 SIMD_MULPD(__pips_tmp, vec3, vec4); \
 SIMD_ADDPD(vec1, __pips_tmp, vec2); \
 } while(0)
#define SIMD_UMINPD(vec1, vec2) \
 do { \
 __m128d __pips_tmp; \
 __pips_tmp = _mm_setzero_pd(); \
 vec1 = _mm_sub_pd(__pips_tmp, vec2); \
 } while(0)

#define SIMD_COSPD(vec1, vec2) \
 do { \
 double __pips_tmp[2] __attribute__ ((aligned (16))); \
 SIMD_STORE_V2DF(vec2, __pips_tmp); \
 __pips_tmp[0] = cos(__pips_tmp[0]); \
 __pips_tmp[1] = cos(__pips_tmp[1]); \
 SIMD_LOAD_V2DF(vec1, __pips_tmp); \
 } while(0)

#define SIMD_SINPD(vec1, vec2) \
 do { \
 double __pips_tmp[2] __attribute__ ((aligned (16))); \
 SIMD_STORE_V2DF(vec2, __pips_tmp); \
 __pips_tmp[0] = sin(__pips_tmp[0]); \
 __pips_tmp[1] = sin(__pips_tmp[1]); \
 SIMD_LOAD_V2DF(vec1, __pips_tmp); \
 } while(0)

#define SIMD_STORE_V2DF(vec,arr) _mm_storeu_pd(arr,vec)
#define SIMD_STORE_GENERIC_V2DF(vec, v0, v1) \
 do { \
 double __pips_tmp[2]; \
 SIMD_STORE_V2DF(vec,&__pips_tmp[0]); \
 *(v0)=__pips_tmp[0]; \
 *(v1)=__pips_tmp[1]; \
 } while (0)
/* the temporary uses the __pips_ prefix like the other macros, so that it
 * cannot capture a caller variable passed as an argument */
#define SIMD_LOAD_GENERIC_V2DF(vec,v0,v1) \
 do { \
 double __pips_tmp[2] = { v0, v1 }; \
 SIMD_LOAD_V2DF(vec,&__pips_tmp[0]); \
 } while(0)

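/* Usage sketch (illustrative, not part of the original header; the function
 * name is hypothetical): dst[i] = sin(src[i]) two lanes at a time, assuming
 * n is a multiple of 2. */
static inline void pips_example_sin_v2df(double *dst, const double *src, int n) {
    int i;
    for (i = 0; i < n; i += 2) {
        v2df v;
        SIMD_LOAD_V2DF(v, &src[i]);
        SIMD_SINPD(v, v);
        SIMD_STORE_V2DF(v, &dst[i]);
    }
}
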
/* conversions */
#define SIMD_STORE_V2DF_TO_V2SF(vec,f) \
 do { \
 double __pips_tmp[2]; \
 SIMD_STORE_V2DF(vec, __pips_tmp); \
 (f)[0] = __pips_tmp[0]; \
 (f)[1] = __pips_tmp[1]; \
 } while(0)

#define SIMD_LOAD_V2SF_TO_V2DF(vec,f) \
 SIMD_LOAD_GENERIC_V2DF(vec,(f)[0],(f)[1])

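/* Usage sketch (illustrative, not part of the original header; the function
 * name is hypothetical): widen two floats to double lanes, then narrow them
 * back to floats. */
static inline void pips_example_roundtrip_v2df(float dst[2], const float src[2]) {
    v2df v;
    SIMD_LOAD_V2SF_TO_V2DF(v, src);  /* two floats widened to doubles */
    SIMD_STORE_V2DF_TO_V2SF(v, dst); /* narrowed back to floats */
}
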
/* short int (the "hi" suffix stands for halfword integer) */
#define SIMD_LOAD_V8HI(vec,arr) \
 vec = _mm_loadu_si128((__m128i *)(arr))

#define SIMD_STORE_V8HI(vec,arr) \
 _mm_storeu_si128((__m128i *)(arr), vec)

#define SIMD_STORE_V8HI_TO_V8SI(vec,arr) \
 SIMD_STORE_V8HI(vec,arr)
#define SIMD_LOAD_V8SI_TO_V8HI(vec,arr) \
 SIMD_LOAD_V8HI(vec,arr)

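/* Usage sketch (illustrative, not part of the original header; the function
 * name is hypothetical): copy eight 16-bit elements through an XMM register. */
static inline void pips_example_copy_v8hi(short dst[8], const short src[8]) {
    v8hi v;
    SIMD_LOAD_V8HI(v, src);
    SIMD_STORE_V8HI(v, dst);
}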