1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | #ifndef __IMMINTRIN_H |
25 | #error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead." |
26 | #endif |
27 | |
28 | #ifndef __AVX512VLDQINTRIN_H |
29 | #define __AVX512VLDQINTRIN_H |
30 | |
31 | |
32 | #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128))) |
33 | #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256))) |
34 | |
/* Packed 64-bit integer multiply keeping the low 64 bits of each product
   (VPMULLQ).  Each width comes in three forms: unmasked, merge-masked
   (_mask_: lanes with a clear bit in __U are taken from __W) and
   zero-masked (_maskz_: lanes with a clear bit in __U are zeroed). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
  /* Plain C vector multiply; the backend lowers this to VPMULLQ. */
  return (__m256i) ((__v4du) __A * (__v4du) __B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  /* selectq picks product lanes where __U has a 1 bit, __W lanes elsewhere. */
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_mullo_epi64(__A, __B),
                                             (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
  /* Same selection, but inactive lanes come from an all-zero vector. */
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                             (__v4di)_mm256_mullo_epi64(__A, __B),
                                             (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mullo_epi64 (__m128i __A, __m128i __B) {
  return (__m128i) ((__v2du) __A * (__v2du) __B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_mullo_epi64(__A, __B),
                                             (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_mullo_epi64(__A, __B),
                                             (__v2di)_mm_setzero_si128());
}
72 | |
/* Masked forms of bitwise AND-NOT on packed doubles/floats: the wrapped
   AVX op computes (~__A) & __B, then selectp[ds] merges with __W (_mask_)
   or zeroes inactive lanes (_maskz_) according to mask __U. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_andnot_pd(__A, __B),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_andnot_pd(__A, __B),
                                              (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_andnot_pd(__A, __B),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_andnot_pd(__A, __B),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_andnot_ps(__A, __B),
                                             (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_andnot_ps(__A, __B),
                                             (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_andnot_ps(__A, __B),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_andnot_ps(__A, __B),
                                             (__v4sf)_mm_setzero_ps());
}
128 | |
/* Masked forms of bitwise AND on packed doubles/floats: __A & __B, merged
   with __W (_mask_) or zeroed (_maskz_) per mask __U. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_and_pd(__A, __B),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_and_pd(__A, __B),
                                              (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_and_pd(__A, __B),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_and_pd(__A, __B),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_and_ps(__A, __B),
                                             (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_and_ps(__A, __B),
                                             (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_and_ps(__A, __B),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_and_ps(__A, __B),
                                             (__v4sf)_mm_setzero_ps());
}
184 | |
/* Masked forms of bitwise XOR on packed doubles/floats: __A ^ __B, merged
   with __W (_mask_) or zeroed (_maskz_) per mask __U. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_xor_pd(__A, __B),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_xor_pd(__A, __B),
                                              (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_xor_pd(__A, __B),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_xor_pd(__A, __B),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_xor_ps(__A, __B),
                                             (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_xor_ps(__A, __B),
                                             (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_xor_ps(__A, __B),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_xor_ps(__A, __B),
                                             (__v4sf)_mm_setzero_ps());
}
240 | |
/* Masked forms of bitwise OR on packed doubles/floats: __A | __B, merged
   with __W (_mask_) or zeroed (_maskz_) per mask __U. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_or_pd(__A, __B),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_or_pd(__A, __B),
                                              (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_or_pd(__A, __B),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_or_pd(__A, __B),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_or_ps(__A, __B),
                                             (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                             (__v8sf)_mm256_or_ps(__A, __B),
                                             (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_or_ps(__A, __B),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_or_ps(__A, __B),
                                             (__v4sf)_mm_setzero_ps());
}
296 | |
/* Convert packed doubles to packed signed 64-bit ints (VCVTPD2QQ), using
   the current MXCSR rounding mode.  The _mask builtin takes a passthrough
   vector and a mask: mask -1 means all lanes active (unmasked form);
   __W or a zero vector supplies the inactive lanes otherwise. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtpd_epi64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtpd_epi64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                                                    (__v4di) _mm256_setzero_si256(),
                                                    (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                                                    (__v4di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
                                                    (__v4di) _mm256_setzero_si256(),
                                                    (__mmask8) __U);
}
338 | |
/* Convert packed doubles to packed unsigned 64-bit ints (VCVTPD2UQQ),
   using the current MXCSR rounding mode; same mask/passthrough scheme as
   the signed conversions above. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtpd_epu64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                                                     (__v2di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtpd_epu64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                                                     (__v4di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) __U);
}
380 | |
/* Convert packed floats to packed signed 64-bit ints (VCVTPS2QQ), current
   rounding mode.  Note the widening: the 128-bit form consumes only the
   low 2 floats of __A; the 256-bit form consumes all 4 floats of a 128-bit
   source. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtps_epi64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                                                    (__v2di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtps_epi64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                                                    (__v4di) _mm256_setzero_si256(),
                                                    (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                                                    (__v4di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
                                                    (__v4di) _mm256_setzero_si256(),
                                                    (__mmask8) __U);
}
422 | |
/* Convert packed floats to packed unsigned 64-bit ints (VCVTPS2UQQ),
   current rounding mode; same widening layout as the signed float
   conversions above. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtps_epu64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                                                     (__v2di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtps_epu64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                                                     (__v4di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) __U);
}
464 | |
/* Convert packed signed 64-bit ints to packed doubles (VCVTQQ2PD).  The
   unmasked form is a generic lane-wise __builtin_convertvector; the
   masked forms select between that result and __W / zero under __U. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtepi64_pd (__m128i __A) {
  return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_cvtepi64_pd(__A),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_cvtepi64_pd(__A),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_pd (__m256i __A) {
  return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_cvtepi64_pd(__A),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_cvtepi64_pd(__A),
                                              (__v4df)_mm256_setzero_pd());
}
502 | |
/* Convert packed signed 64-bit ints to packed floats (VCVTQQ2PS) — a
   narrowing conversion.  The 128-bit source yields only 2 floats, so the
   builtin is used (upper result lanes are zeroed by the instruction); the
   256-bit source fills all 4 floats of a 128-bit result via
   __builtin_convertvector, with masked variants selecting afterwards. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtepi64_ps (__m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                                                   (__v4sf) _mm_setzero_ps(),
                                                   (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                                                   (__v4sf) __W,
                                                   (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
                                                   (__v4sf) _mm_setzero_ps(),
                                                   (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ps (__m256i __A) {
  return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm256_cvtepi64_ps(__A),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm256_cvtepi64_ps(__A),
                                             (__v4sf)_mm_setzero_ps());
}
542 | |
/* Convert packed doubles to packed signed 64-bit ints with truncation
   (VCVTTPD2QQ) — rounds toward zero regardless of MXCSR. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttpd_epi64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                                                     (__v2di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttpd_epi64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                                                     (__v4di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) __U);
}
584 | |
/* Convert packed doubles to packed unsigned 64-bit ints with truncation
   (VCVTTPD2UQQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttpd_epu64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                                                      (__v2di) _mm_setzero_si128(),
                                                      (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                                                      (__v2di) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
                                                      (__v2di) _mm_setzero_si128(),
                                                      (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttpd_epu64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                                                      (__v4di) _mm256_setzero_si256(),
                                                      (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                                                      (__v4di) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
                                                      (__v4di) _mm256_setzero_si256(),
                                                      (__mmask8) __U);
}
626 | |
/* Convert packed floats to packed signed 64-bit ints with truncation
   (VCVTTPS2QQ); widening — see the non-truncating float variants above
   for the lane layout. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttps_epi64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                                                     (__v2di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
                                                     (__v2di) _mm_setzero_si128(),
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttps_epi64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                                                     (__v4di) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
                                                     (__v4di) _mm256_setzero_si256(),
                                                     (__mmask8) __U);
}
668 | |
/* Convert packed floats to packed unsigned 64-bit ints with truncation
   (VCVTTPS2UQQ). */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvttps_epu64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                                                      (__v2di) _mm_setzero_si128(),
                                                      (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                                                      (__v2di) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
                                                      (__v2di) _mm_setzero_si128(),
                                                      (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttps_epu64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                                                      (__v4di) _mm256_setzero_si256(),
                                                      (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                                                      (__v4di) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
                                                      (__v4di) _mm256_setzero_si256(),
                                                      (__mmask8) __U);
}
710 | |
/* Convert packed unsigned 64-bit ints to packed doubles (VCVTUQQ2PD);
   unsigned interpretation comes from the __v2du/__v4du source types. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtepu64_pd (__m128i __A) {
  return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_cvtepu64_pd(__A),
                                              (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
                                              (__v2df)_mm_cvtepu64_pd(__A),
                                              (__v2df)_mm_setzero_pd());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_pd (__m256i __A) {
  return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_cvtepu64_pd(__A),
                                              (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
                                              (__v4df)_mm256_cvtepu64_pd(__A),
                                              (__v4df)_mm256_setzero_pd());
}
748 | |
/* Convert packed unsigned 64-bit ints to packed floats (VCVTUQQ2PS) —
   narrowing; same lane layout as the signed cvtepi64_ps family above. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtepu64_ps (__m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                                                    (__v4sf) _mm_setzero_ps(),
                                                    (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                                                    (__v4sf) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
                                                    (__v4sf) _mm_setzero_ps(),
                                                    (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ps (__m256i __A) {
  return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm256_cvtepu64_ps(__A),
                                             (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm256_cvtepu64_ps(__A),
                                             (__v4sf)_mm_setzero_ps());
}
788 | |
/* VRANGEPD: per-element range restriction of packed doubles; imm8 C selects
   the operation (min/max/abs variants) and sign control.  Must be macros
   because C requires a compile-time immediate.  Each expansion is wrapped
   in an extra set of parentheses so the macro behaves as a single primary
   expression wherever it is used. */
#define _mm_range_pd(A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)_mm_setzero_pd(), \
                                           (__mmask8)-1))

#define _mm_mask_range_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)(__m128d)(W), \
                                           (__mmask8)(U)))

#define _mm_maskz_range_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), (int)(C), \
                                           (__v2df)_mm_setzero_pd(), \
                                           (__mmask8)(U)))

#define _mm256_range_pd(A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)_mm256_setzero_pd(), \
                                           (__mmask8)-1))

#define _mm256_mask_range_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)(__m256d)(W), \
                                           (__mmask8)(U)))

#define _mm256_maskz_range_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(C), \
                                           (__v4df)_mm256_setzero_pd(), \
                                           (__mmask8)(U)))
824 | |
/* VRANGEPS: per-element range restriction of packed floats; imm8 C selects
   the operation.  Expansions are fully parenthesized so each macro is a
   single primary expression. */
#define _mm_range_ps(A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
                                          (__v4sf)_mm_setzero_ps(), \
                                          (__mmask8)-1))

#define _mm_mask_range_ps(W, U, A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
                                          (__v4sf)(__m128)(W), (__mmask8)(U)))

#define _mm_maskz_range_ps(U, A, B, C) \
  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), (int)(C), \
                                          (__v4sf)_mm_setzero_ps(), \
                                          (__mmask8)(U)))

#define _mm256_range_ps(A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
                                          (__v8sf)_mm256_setzero_ps(), \
                                          (__mmask8)-1))

#define _mm256_mask_range_ps(W, U, A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
                                          (__v8sf)(__m256)(W), (__mmask8)(U)))

#define _mm256_maskz_range_ps(U, A, B, C) \
  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(C), \
                                          (__v8sf)_mm256_setzero_ps(), \
                                          (__mmask8)(U)))
858 | |
/* VREDUCEPD/VREDUCEPS: per-element "reduce" of packed FP values — returns
   x minus a rounded version of x, with the number of fraction bits and
   rounding behavior selected by imm8 B.
   Each operation comes in unmasked / merge-masked (W) / zero-masked forms.
   NOTE(review): expansions wrapped in outer parentheses for macro hygiene. */
#define _mm_reduce_pd(A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1))

#define _mm_mask_reduce_pd(W, U, A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U)))

#define _mm_maskz_reduce_pd(U, A, B) \
  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U)))

#define _mm256_reduce_pd(A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
                                            (__mmask8)-1))

#define _mm256_mask_reduce_pd(W, U, A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)(__m256d)(W), \
                                            (__mmask8)(U)))

#define _mm256_maskz_reduce_pd(U, A, B) \
  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
                                            (__v4df)_mm256_setzero_pd(), \
                                            (__mmask8)(U)))

#define _mm_reduce_ps(A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1))

#define _mm_mask_reduce_ps(W, U, A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)(__m128)(W), \
                                           (__mmask8)(U)))

#define _mm_maskz_reduce_ps(U, A, B) \
  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U)))

#define _mm256_reduce_ps(A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)-1))

#define _mm256_mask_reduce_ps(W, U, A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)(__m256)(W), \
                                           (__mmask8)(U)))

#define _mm256_maskz_reduce_ps(U, A, B) \
  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)(U)))
918 | |
/* VPMOVD2M: gather the most-significant (sign) bit of each of the four
   32-bit elements of __A into the low 4 bits of the returned mask. */
static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi32_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
}
924 | |
/* VPMOVD2M (256-bit): sign bit of each of the eight 32-bit elements of
   __A becomes one bit of the returned 8-bit mask. */
static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
_mm256_movepi32_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
}
930 | |
/* VPMOVM2D: expand mask __A to a vector — each 32-bit element is all-ones
   if the corresponding mask bit is set, all-zeros otherwise. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_movm_epi32 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
}
936 | |
/* VPMOVM2D (256-bit): expand the low 8 mask bits of __A into eight
   all-ones/all-zeros 32-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_movm_epi32 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
}
942 | |
/* VPMOVM2Q: expand the low 2 mask bits of __A into two all-ones/all-zeros
   64-bit elements. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_movm_epi64 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
}
948 | |
/* VPMOVM2Q (256-bit): expand the low 4 mask bits of __A into four
   all-ones/all-zeros 64-bit elements. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_movm_epi64 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
}
954 | |
/* VPMOVQ2M: gather the sign bit of each of the two 64-bit elements of
   __A into the low 2 bits of the returned mask. */
static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi64_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
}
960 | |
/* VPMOVQ2M (256-bit): sign bit of each of the four 64-bit elements of
   __A becomes one bit of the returned mask. */
static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
_mm256_movepi64_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
}
966 | |
/* Broadcast the low pair of floats (elements 0,1) of __A to all four
   64-bit positions of a 256-bit vector: result = {a0,a1,a0,a1,a0,a1,a0,a1}. */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_broadcast_f32x2 (__m128 __A)
{
  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                         0, 1, 0, 1, 0, 1, 0, 1);
}
973 | |
/* Masked broadcast of the low float pair of __A; elements with a clear
   bit in __M take the corresponding value from __O (merge-masking). */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
                                             (__v8sf)_mm256_broadcast_f32x2(__A),
                                             (__v8sf)__O);
}
981 | |
/* Zero-masked broadcast of the low float pair of __A; elements with a
   clear bit in __M are set to zero. */
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
                                             (__v8sf)_mm256_broadcast_f32x2(__A),
                                             (__v8sf)_mm256_setzero_ps());
}
989 | |
/* Broadcast the 128-bit pair of doubles in __A to both halves of a
   256-bit vector: result = {a0,a1,a0,a1}. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_broadcast_f64x2(__m128d __A)
{
  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
                                          0, 1, 0, 1);
}
996 | |
/* Masked 128-bit double-pair broadcast; elements with a clear bit in __M
   take the corresponding value from __O (merge-masking). */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
                                              (__v4df)_mm256_broadcast_f64x2(__A),
                                              (__v4df)__O);
}
1004 | |
/* Zero-masked 128-bit double-pair broadcast; elements with a clear bit
   in __M are set to zero. */
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
                                              (__v4df)_mm256_broadcast_f64x2(__A),
                                              (__v4df)_mm256_setzero_pd());
}
1012 | |
/* Broadcast the low pair of 32-bit integers (elements 0,1) of __A to
   both 64-bit positions of a 128-bit vector: result = {a0,a1,a0,a1}. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcast_i32x2 (__m128i __A)
{
  return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 0, 1);
}
1019 | |
/* Masked 32-bit-pair broadcast; elements with a clear bit in __M take
   the corresponding value from __O (merge-masking). */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
                                             (__v4si)_mm_broadcast_i32x2(__A),
                                             (__v4si)__O);
}
1027 | |
/* Zero-masked 32-bit-pair broadcast; elements with a clear bit in __M
   are set to zero. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
                                             (__v4si)_mm_broadcast_i32x2(__A),
                                             (__v4si)_mm_setzero_si128());
}
1035 | |
/* Broadcast the low pair of 32-bit integers of __A to all four 64-bit
   positions of a 256-bit vector: result = {a0,a1,a0,a1,a0,a1,a0,a1}. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcast_i32x2 (__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 0, 1, 0, 1, 0, 1);
}
1042 | |
/* Masked 32-bit-pair broadcast (256-bit); elements with a clear bit in
   __M take the corresponding value from __O (merge-masking). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
                                             (__v8si)_mm256_broadcast_i32x2(__A),
                                             (__v8si)__O);
}
1050 | |
/* Zero-masked 32-bit-pair broadcast (256-bit); elements with a clear
   bit in __M are set to zero. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
                                             (__v8si)_mm256_broadcast_i32x2(__A),
                                             (__v8si)_mm256_setzero_si256());
}
1058 | |
/* Broadcast the 128-bit pair of 64-bit integers in __A to both halves
   of a 256-bit vector: result = {a0,a1,a0,a1}. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcast_i64x2(__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
                                          0, 1, 0, 1);
}
1065 | |
/* Masked 128-bit quadword-pair broadcast; elements with a clear bit in
   __M take the corresponding value from __O (merge-masking). */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
                                             (__v4di)_mm256_broadcast_i64x2(__A),
                                             (__v4di)__O);
}
1073 | |
/* Zero-masked 128-bit quadword-pair broadcast; elements with a clear
   bit in __M are set to zero. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
                                             (__v4di)_mm256_broadcast_i64x2(__A),
                                             (__v4di)_mm256_setzero_si256());
}
1081 | |
/* NOTE(review): the macro names were missing from these six #defines
   (they read `#define (args)`), which breaks preprocessing of this
   header. Restored the standard Intel intrinsic names
   (_mm256_[mask[z]_]extractf64x2_pd / _mm256_[mask[z]_]extracti64x2_epi64)
   and wrapped each expansion in outer parentheses for macro hygiene.
   VEXTRACTF64X2/VEXTRACTI64X2: extract the 128-bit lane of A selected
   by imm, with optional merge- or zero-masking. */
#define _mm256_extractf64x2_pd(A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)_mm_undefined_pd(), \
                                                 (__mmask8)-1))

#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U)))

#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
                                                 (int)(imm), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U)))

#define _mm256_extracti64x2_epi64(A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v2di)_mm_undefined_si128(), \
                                                 (__mmask8)-1))

#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v2di)(__m128i)(W), \
                                                 (__mmask8)(U)))

#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
                                                 (int)(imm), \
                                                 (__v2di)_mm_setzero_si128(), \
                                                 (__mmask8)(U)))
1117 | |
/* VINSERTF64X2/VINSERTI64X2: insert the 128-bit vector B into the lane
   of A selected by imm. Masked forms apply merge- or zero-masking via
   a vector select over the unmasked result.
   NOTE(review): expansions wrapped in outer parentheses for macro hygiene. */
#define _mm256_insertf64x2(A, B, imm) \
  ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
                                           (__v2df)(__m128d)(B), (int)(imm)))

#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                   (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                   (__v4df)(__m256d)(W)))

#define _mm256_maskz_insertf64x2(U, A, B, imm) \
  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                   (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                   (__v4df)_mm256_setzero_pd()))

#define _mm256_inserti64x2(A, B, imm) \
  ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
                                           (__v2di)(__m128i)(B), (int)(imm)))

#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                   (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                   (__v4di)(__m256i)(W)))

#define _mm256_maskz_inserti64x2(U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                   (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                   (__v4di)_mm256_setzero_si256()))
1145 | |
/* VFPCLASSPD/VFPCLASSPS: test each FP element for the categories selected
   by imm (e.g. QNaN, +/-0, +/-Inf, denormal, negative); the result is a
   bitmask with one bit per element. Masked forms AND with U.
   NOTE(review): expansions wrapped in outer parentheses for macro hygiene. */
#define _mm_mask_fpclass_pd_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm_fpclass_pd_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm256_fpclass_pd_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm_mask_fpclass_ps_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm_fpclass_ps_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                              (__mmask8)-1))

#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__mmask8)(U)))

#define _mm256_fpclass_ps_mask(A, imm) \
  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                              (__mmask8)-1))
1177 | |
1178 | #undef __DEFAULT_FN_ATTRS128 |
1179 | #undef __DEFAULT_FN_ATTRS256 |
1180 | |
1181 | #endif |
1182 | |