/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
24 | #ifndef __IMMINTRIN_H |
25 | #error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead." |
26 | #endif |
27 | |
28 | #ifndef __AVX512VLVBMI2INTRIN_H |
29 | #define __AVX512VLVBMI2INTRIN_H |
30 | |
31 | |
32 | #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128))) |
33 | #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256))) |
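
/* Compress: the elements of __D selected by the mask __U are packed
 * contiguously into the low-order elements of the result.  The _mask_
 * forms take the remaining elements from __S; the _maskz_ forms zero
 * them.  Illustrative example:
 *
 *   __m128i d = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
 *   __m128i r = _mm_maskz_compress_epi16(0xAA, d);
 *   // r = {11, 13, 15, 17, 0, 0, 0, 0}   (mask bits 1, 3, 5, 7 set)
 */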
34 | |
35 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
36 | _mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) |
37 | { |
38 | return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, |
39 | (__v8hi) __S, |
40 | __U); |
41 | } |
42 | |
43 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
44 | _mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) |
45 | { |
46 | return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, |
47 | (__v8hi) _mm_setzero_si128(), |
48 | __U); |
49 | } |
50 | |
51 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
52 | _mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) |
53 | { |
54 | return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, |
55 | (__v16qi) __S, |
56 | __U); |
57 | } |
58 | |
59 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
60 | _mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) |
61 | { |
62 | return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, |
63 | (__v16qi) _mm_setzero_si128(), |
64 | __U); |
65 | } |
66 | |
67 | static __inline__ void __DEFAULT_FN_ATTRS128 |
68 | _mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) |
69 | { |
70 | __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, |
71 | __U); |
72 | } |
73 | |
74 | static __inline__ void __DEFAULT_FN_ATTRS128 |
75 | _mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) |
76 | { |
77 | __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, |
78 | __U); |
79 | } |
80 | |
81 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
82 | _mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) |
83 | { |
84 | return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, |
85 | (__v8hi) __S, |
86 | __U); |
87 | } |
88 | |
89 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
90 | _mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) |
91 | { |
92 | return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, |
93 | (__v8hi) _mm_setzero_si128(), |
94 | __U); |
95 | } |
96 | |
97 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
98 | _mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) |
99 | { |
100 | return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, |
101 | (__v16qi) __S, |
102 | __U); |
103 | } |
104 | |
105 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
106 | _mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) |
107 | { |
108 | return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, |
109 | (__v16qi) _mm_setzero_si128(), |
110 | __U); |
111 | } |
112 | |
113 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
114 | _mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) |
115 | { |
116 | return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, |
117 | (__v8hi) __S, |
118 | __U); |
119 | } |
120 | |
121 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
122 | _mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) |
123 | { |
124 | return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, |
125 | (__v8hi) _mm_setzero_si128(), |
126 | __U); |
127 | } |
128 | |
129 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
130 | _mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) |
131 | { |
132 | return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, |
133 | (__v16qi) __S, |
134 | __U); |
135 | } |
136 | |
137 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
138 | _mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) |
139 | { |
140 | return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, |
141 | (__v16qi) _mm_setzero_si128(), |
142 | __U); |
143 | } |
144 | |
145 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
146 | _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) |
147 | { |
148 | return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, |
149 | (__v16hi) __S, |
150 | __U); |
151 | } |
152 | |
153 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
154 | _mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) |
155 | { |
156 | return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, |
157 | (__v16hi) _mm256_setzero_si256(), |
158 | __U); |
159 | } |
160 | |
161 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
162 | _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) |
163 | { |
164 | return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, |
165 | (__v32qi) __S, |
166 | __U); |
167 | } |
168 | |
169 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
170 | _mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) |
171 | { |
172 | return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, |
173 | (__v32qi) _mm256_setzero_si256(), |
174 | __U); |
175 | } |
176 | |
177 | static __inline__ void __DEFAULT_FN_ATTRS256 |
178 | _mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) |
179 | { |
180 | __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, |
181 | __U); |
182 | } |
183 | |
184 | static __inline__ void __DEFAULT_FN_ATTRS256 |
185 | _mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) |
186 | { |
187 | __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, |
188 | __U); |
189 | } |
190 | |
191 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
192 | _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) |
193 | { |
194 | return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, |
195 | (__v16hi) __S, |
196 | __U); |
197 | } |
198 | |
199 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
200 | _mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) |
201 | { |
202 | return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, |
203 | (__v16hi) _mm256_setzero_si256(), |
204 | __U); |
205 | } |
206 | |
207 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
208 | _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) |
209 | { |
210 | return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, |
211 | (__v32qi) __S, |
212 | __U); |
213 | } |
214 | |
215 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
216 | _mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) |
217 | { |
218 | return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, |
219 | (__v32qi) _mm256_setzero_si256(), |
220 | __U); |
221 | } |
222 | |
223 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
224 | _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) |
225 | { |
226 | return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, |
227 | (__v16hi) __S, |
228 | __U); |
229 | } |
230 | |
231 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
232 | _mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) |
233 | { |
234 | return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, |
235 | (__v16hi) _mm256_setzero_si256(), |
236 | __U); |
237 | } |
238 | |
239 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
240 | _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) |
241 | { |
242 | return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, |
243 | (__v32qi) __S, |
244 | __U); |
245 | } |
246 | |
247 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
248 | _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) |
249 | { |
250 | return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, |
251 | (__v32qi) _mm256_setzero_si256(), |
252 | __U); |
253 | } |
254 | |
255 | #define _mm256_shldi_epi64(A, B, I) \ |
256 | (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ |
257 | (__v4di)(__m256i)(B), (int)(I)) |
258 | |
259 | #define _mm256_mask_shldi_epi64(S, U, A, B, I) \ |
260 | (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ |
261 | (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ |
262 | (__v4di)(__m256i)(S)) |
263 | |
264 | #define _mm256_maskz_shldi_epi64(U, A, B, I) \ |
265 | (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ |
266 | (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ |
267 | (__v4di)_mm256_setzero_si256()) |
268 | |
269 | #define _mm_shldi_epi64(A, B, I) \ |
270 | (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ |
271 | (__v2di)(__m128i)(B), (int)(I)) |
272 | |
273 | #define _mm_mask_shldi_epi64(S, U, A, B, I) \ |
274 | (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ |
275 | (__v2di)_mm_shldi_epi64((A), (B), (I)), \ |
276 | (__v2di)(__m128i)(S)) |
277 | |
278 | #define _mm_maskz_shldi_epi64(U, A, B, I) \ |
279 | (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ |
280 | (__v2di)_mm_shldi_epi64((A), (B), (I)), \ |
281 | (__v2di)_mm_setzero_si128()) |
282 | |
283 | #define _mm256_shldi_epi32(A, B, I) \ |
284 | (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ |
285 | (__v8si)(__m256i)(B), (int)(I)) |
286 | |
287 | #define _mm256_mask_shldi_epi32(S, U, A, B, I) \ |
288 | (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ |
289 | (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ |
290 | (__v8si)(__m256i)(S)) |
291 | |
292 | #define _mm256_maskz_shldi_epi32(U, A, B, I) \ |
293 | (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ |
294 | (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ |
295 | (__v8si)_mm256_setzero_si256()) |
296 | |
297 | #define _mm_shldi_epi32(A, B, I) \ |
298 | (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ |
299 | (__v4si)(__m128i)(B), (int)(I)) |
300 | |
301 | #define _mm_mask_shldi_epi32(S, U, A, B, I) \ |
302 | (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ |
303 | (__v4si)_mm_shldi_epi32((A), (B), (I)), \ |
304 | (__v4si)(__m128i)(S)) |
305 | |
306 | #define _mm_maskz_shldi_epi32(U, A, B, I) \ |
307 | (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ |
308 | (__v4si)_mm_shldi_epi32((A), (B), (I)), \ |
309 | (__v4si)_mm_setzero_si128()) |
310 | |
311 | #define _mm256_shldi_epi16(A, B, I) \ |
312 | (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ |
313 | (__v16hi)(__m256i)(B), (int)(I)) |
314 | |
315 | #define _mm256_mask_shldi_epi16(S, U, A, B, I) \ |
316 | (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ |
317 | (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ |
318 | (__v16hi)(__m256i)(S)) |
319 | |
320 | #define _mm256_maskz_shldi_epi16(U, A, B, I) \ |
321 | (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ |
322 | (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ |
323 | (__v16hi)_mm256_setzero_si256()) |
324 | |
325 | #define _mm_shldi_epi16(A, B, I) \ |
326 | (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ |
327 | (__v8hi)(__m128i)(B), (int)(I)) |
328 | |
329 | #define _mm_mask_shldi_epi16(S, U, A, B, I) \ |
330 | (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ |
331 | (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ |
332 | (__v8hi)(__m128i)(S)) |
333 | |
334 | #define _mm_maskz_shldi_epi16(U, A, B, I) \ |
335 | (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ |
336 | (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ |
337 | (__v8hi)_mm_setzero_si128()) |
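
/* Funnel shift right by immediate: each result element is the lower half
 * of the double-width value (B[i]:A[i]) shifted right by I modulo the
 * element width, i.e. for a 64-bit element and shift s = I & 63,
 * r = s ? (a >> s) | (b << (64 - s)) : a.
 */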
338 | |
339 | #define _mm256_shrdi_epi64(A, B, I) \ |
340 | (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ |
341 | (__v4di)(__m256i)(B), (int)(I)) |
342 | |
343 | #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ |
344 | (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ |
345 | (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ |
346 | (__v4di)(__m256i)(S)) |
347 | |
348 | #define _mm256_maskz_shrdi_epi64(U, A, B, I) \ |
349 | (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ |
350 | (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ |
351 | (__v4di)_mm256_setzero_si256()) |
352 | |
353 | #define _mm_shrdi_epi64(A, B, I) \ |
354 | (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ |
355 | (__v2di)(__m128i)(B), (int)(I)) |
356 | |
357 | #define _mm_mask_shrdi_epi64(S, U, A, B, I) \ |
358 | (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ |
359 | (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ |
360 | (__v2di)(__m128i)(S)) |
361 | |
362 | #define _mm_maskz_shrdi_epi64(U, A, B, I) \ |
363 | (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ |
364 | (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ |
365 | (__v2di)_mm_setzero_si128()) |
366 | |
367 | #define _mm256_shrdi_epi32(A, B, I) \ |
368 | (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ |
369 | (__v8si)(__m256i)(B), (int)(I)) |
370 | |
371 | #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ |
372 | (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ |
373 | (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ |
374 | (__v8si)(__m256i)(S)) |
375 | |
376 | #define _mm256_maskz_shrdi_epi32(U, A, B, I) \ |
377 | (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ |
378 | (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ |
379 | (__v8si)_mm256_setzero_si256()) |
380 | |
381 | #define _mm_shrdi_epi32(A, B, I) \ |
382 | (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ |
383 | (__v4si)(__m128i)(B), (int)(I)) |
384 | |
385 | #define _mm_mask_shrdi_epi32(S, U, A, B, I) \ |
386 | (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ |
387 | (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ |
388 | (__v4si)(__m128i)(S)) |
389 | |
390 | #define _mm_maskz_shrdi_epi32(U, A, B, I) \ |
391 | (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ |
392 | (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ |
393 | (__v4si)_mm_setzero_si128()) |
394 | |
395 | #define _mm256_shrdi_epi16(A, B, I) \ |
396 | (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ |
397 | (__v16hi)(__m256i)(B), (int)(I)) |
398 | |
399 | #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ |
400 | (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ |
401 | (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ |
402 | (__v16hi)(__m256i)(S)) |
403 | |
404 | #define _mm256_maskz_shrdi_epi16(U, A, B, I) \ |
405 | (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ |
406 | (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ |
407 | (__v16hi)_mm256_setzero_si256()) |
408 | |
409 | #define _mm_shrdi_epi16(A, B, I) \ |
410 | (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ |
411 | (__v8hi)(__m128i)(B), (int)(I)) |
412 | |
413 | #define _mm_mask_shrdi_epi16(S, U, A, B, I) \ |
414 | (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ |
415 | (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ |
416 | (__v8hi)(__m128i)(S)) |
417 | |
418 | #define _mm_maskz_shrdi_epi16(U, A, B, I) \ |
419 | (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ |
420 | (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ |
421 | (__v8hi)_mm_setzero_si128()) |
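
/* The *_shldv_* and *_shrdv_* intrinsics below are the variable-count
 * funnel shifts: the per-element shift amount is taken from the
 * corresponding element of __C (modulo the element width) instead of an
 * immediate.  Illustrative example:
 *
 *   __m128i s = _mm_setr_epi32(1, 2, 3, 4);
 *   __m128i r = _mm_shldv_epi32(a, b, s);
 *   // r[i] = (a[i] << s[i]) | (b[i] >> (32 - s[i]))
 */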
422 | |
423 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
424 | _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) |
425 | { |
426 | return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, |
427 | (__v4di)__C); |
428 | } |
429 | |
430 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
431 | _mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) |
432 | { |
433 | return (__m256i)__builtin_ia32_selectq_256(__U, |
434 | (__v4di)_mm256_shldv_epi64(__A, __B, __C), |
435 | (__v4di)__A); |
436 | } |
437 | |
438 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
439 | _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) |
440 | { |
441 | return (__m256i)__builtin_ia32_selectq_256(__U, |
442 | (__v4di)_mm256_shldv_epi64(__A, __B, __C), |
443 | (__v4di)_mm256_setzero_si256()); |
444 | } |
445 | |
446 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
447 | _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) |
448 | { |
449 | return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, |
450 | (__v2di)__C); |
451 | } |
452 | |
453 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
454 | _mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
455 | { |
456 | return (__m128i)__builtin_ia32_selectq_128(__U, |
457 | (__v2di)_mm_shldv_epi64(__A, __B, __C), |
458 | (__v2di)__A); |
459 | } |
460 | |
461 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
462 | _mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
463 | { |
464 | return (__m128i)__builtin_ia32_selectq_128(__U, |
465 | (__v2di)_mm_shldv_epi64(__A, __B, __C), |
466 | (__v2di)_mm_setzero_si128()); |
467 | } |
468 | |
469 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
470 | _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) |
471 | { |
472 | return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, |
473 | (__v8si)__C); |
474 | } |
475 | |
476 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
477 | _mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) |
478 | { |
479 | return (__m256i)__builtin_ia32_selectd_256(__U, |
480 | (__v8si)_mm256_shldv_epi32(__A, __B, __C), |
481 | (__v8si)__A); |
482 | } |
483 | |
484 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
485 | _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) |
486 | { |
487 | return (__m256i)__builtin_ia32_selectd_256(__U, |
488 | (__v8si)_mm256_shldv_epi32(__A, __B, __C), |
489 | (__v8si)_mm256_setzero_si256()); |
490 | } |
491 | |
492 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
493 | _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) |
494 | { |
495 | return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, |
496 | (__v4si)__C); |
497 | } |
498 | |
499 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
500 | _mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
501 | { |
502 | return (__m128i)__builtin_ia32_selectd_128(__U, |
503 | (__v4si)_mm_shldv_epi32(__A, __B, __C), |
504 | (__v4si)__A); |
505 | } |
506 | |
507 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
508 | _mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
509 | { |
510 | return (__m128i)__builtin_ia32_selectd_128(__U, |
511 | (__v4si)_mm_shldv_epi32(__A, __B, __C), |
512 | (__v4si)_mm_setzero_si128()); |
513 | } |
514 | |
515 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
516 | _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) |
517 | { |
518 | return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, |
519 | (__v16hi)__C); |
520 | } |
521 | |
522 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
523 | _mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) |
524 | { |
525 | return (__m256i)__builtin_ia32_selectw_256(__U, |
526 | (__v16hi)_mm256_shldv_epi16(__A, __B, __C), |
527 | (__v16hi)__A); |
528 | } |
529 | |
530 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
531 | _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) |
532 | { |
533 | return (__m256i)__builtin_ia32_selectw_256(__U, |
534 | (__v16hi)_mm256_shldv_epi16(__A, __B, __C), |
535 | (__v16hi)_mm256_setzero_si256()); |
536 | } |
537 | |
538 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
539 | _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) |
540 | { |
541 | return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, |
542 | (__v8hi)__C); |
543 | } |
544 | |
545 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
546 | _mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
547 | { |
548 | return (__m128i)__builtin_ia32_selectw_128(__U, |
549 | (__v8hi)_mm_shldv_epi16(__A, __B, __C), |
550 | (__v8hi)__A); |
551 | } |
552 | |
553 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
554 | _mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
555 | { |
556 | return (__m128i)__builtin_ia32_selectw_128(__U, |
557 | (__v8hi)_mm_shldv_epi16(__A, __B, __C), |
558 | (__v8hi)_mm_setzero_si128()); |
559 | } |
560 | |
561 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
562 | _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) |
563 | { |
564 | return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, |
565 | (__v4di)__C); |
566 | } |
567 | |
568 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
569 | _mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) |
570 | { |
571 | return (__m256i)__builtin_ia32_selectq_256(__U, |
572 | (__v4di)_mm256_shrdv_epi64(__A, __B, __C), |
573 | (__v4di)__A); |
574 | } |
575 | |
576 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
577 | _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) |
578 | { |
579 | return (__m256i)__builtin_ia32_selectq_256(__U, |
580 | (__v4di)_mm256_shrdv_epi64(__A, __B, __C), |
581 | (__v4di)_mm256_setzero_si256()); |
582 | } |
583 | |
584 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
585 | _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) |
586 | { |
587 | return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, |
588 | (__v2di)__C); |
589 | } |
590 | |
591 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
592 | _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
593 | { |
594 | return (__m128i)__builtin_ia32_selectq_128(__U, |
595 | (__v2di)_mm_shrdv_epi64(__A, __B, __C), |
596 | (__v2di)__A); |
597 | } |
598 | |
599 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
600 | _mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
601 | { |
602 | return (__m128i)__builtin_ia32_selectq_128(__U, |
603 | (__v2di)_mm_shrdv_epi64(__A, __B, __C), |
604 | (__v2di)_mm_setzero_si128()); |
605 | } |
606 | |
607 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
608 | _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) |
609 | { |
610 | return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, |
611 | (__v8si)__C); |
612 | } |
613 | |
614 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
615 | _mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) |
616 | { |
617 | return (__m256i)__builtin_ia32_selectd_256(__U, |
618 | (__v8si)_mm256_shrdv_epi32(__A, __B, __C), |
619 | (__v8si)__A); |
620 | } |
621 | |
622 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
623 | _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) |
624 | { |
625 | return (__m256i)__builtin_ia32_selectd_256(__U, |
626 | (__v8si)_mm256_shrdv_epi32(__A, __B, __C), |
627 | (__v8si)_mm256_setzero_si256()); |
628 | } |
629 | |
630 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
631 | _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) |
632 | { |
633 | return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, |
634 | (__v4si)__C); |
635 | } |
636 | |
637 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
638 | _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
639 | { |
640 | return (__m128i)__builtin_ia32_selectd_128(__U, |
641 | (__v4si)_mm_shrdv_epi32(__A, __B, __C), |
642 | (__v4si)__A); |
643 | } |
644 | |
645 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
646 | _mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
647 | { |
648 | return (__m128i)__builtin_ia32_selectd_128(__U, |
649 | (__v4si)_mm_shrdv_epi32(__A, __B, __C), |
650 | (__v4si)_mm_setzero_si128()); |
651 | } |
652 | |
653 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
654 | _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) |
655 | { |
656 | return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, |
657 | (__v16hi)__C); |
658 | } |
659 | |
660 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
661 | _mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) |
662 | { |
663 | return (__m256i)__builtin_ia32_selectw_256(__U, |
664 | (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), |
665 | (__v16hi)__A); |
666 | } |
667 | |
668 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
669 | _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) |
670 | { |
671 | return (__m256i)__builtin_ia32_selectw_256(__U, |
672 | (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), |
673 | (__v16hi)_mm256_setzero_si256()); |
674 | } |
675 | |
676 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
677 | _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) |
678 | { |
679 | return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, |
680 | (__v8hi)__C); |
681 | } |
682 | |
683 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
684 | _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) |
685 | { |
686 | return (__m128i)__builtin_ia32_selectw_128(__U, |
687 | (__v8hi)_mm_shrdv_epi16(__A, __B, __C), |
688 | (__v8hi)__A); |
689 | } |
690 | |
691 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
692 | _mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) |
693 | { |
694 | return (__m128i)__builtin_ia32_selectw_128(__U, |
695 | (__v8hi)_mm_shrdv_epi16(__A, __B, __C), |
696 | (__v8hi)_mm_setzero_si128()); |
697 | } |
698 | |
699 | |
700 | #undef __DEFAULT_FN_ATTRS128 |
701 | #undef __DEFAULT_FN_ATTRS256 |
702 | |
703 | #endif |
704 | |