1 | /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== |
---|---|
2 | * |
3 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | * of this software and associated documentation files (the "Software"), to deal |
5 | * in the Software without restriction, including without limitation the rights |
6 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
7 | * copies of the Software, and to permit persons to whom the Software is |
8 | * furnished to do so, subject to the following conditions: |
9 | * |
10 | * The above copyright notice and this permission notice shall be included in |
11 | * all copies or substantial portions of the Software. |
12 | * |
13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
16 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
17 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
18 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
19 | * THE SOFTWARE. |
20 | * |
21 | *===-----------------------------------------------------------------------=== |
22 | */ |
23 | |
24 | #ifndef __SMMINTRIN_H |
25 | #define __SMMINTRIN_H |
26 | |
27 | #include <tmmintrin.h> |
28 | |
/* Attribute set applied to every inline intrinsic defined in this header:
 * always inline, no debug info, SSE4.1 target feature enabled, and a minimum
 * vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
                 __min_vector_width__(128)))
31 | |
/* SSE4 rounding-control immediates. */

/* Rounding mode (bits [1:0]) and rounding-control source (bit [2]). */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF     0x01
#define _MM_FROUND_TO_POS_INF     0x02
#define _MM_FROUND_TO_ZERO        0x03
#define _MM_FROUND_CUR_DIRECTION  0x04

/* Precision-exception control (bit [3]). */
#define _MM_FROUND_RAISE_EXC      0x00
#define _MM_FROUND_NO_EXC         0x08

/* Ready-made combinations of an exception setting and a rounding mode. */
#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
48 | |
/// Rounds every element of a 128-bit vector of [4 x float] upward (toward
///    positive infinity) to an integer and returns the results as a 128-bit
///    vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    The 128-bit vector of [4 x float] whose elements are rounded up.
/// \returns A 128-bit vector of [4 x float] holding the rounded values.
#define _mm_ceil_ps(X)                                                         \
  _mm_round_ps((X), _MM_FROUND_CEIL)
65 | |
/// Rounds both elements of a 128-bit vector of [2 x double] upward (toward
///    positive infinity) to an integer and returns the results as a 128-bit
///    vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    The 128-bit vector of [2 x double] whose elements are rounded up.
/// \returns A 128-bit vector of [2 x double] holding the rounded values.
#define _mm_ceil_pd(X)                                                         \
  _mm_round_pd((X), _MM_FROUND_CEIL)
82 | |
/// Builds a 128-bit vector of [4 x float] whose three upper elements are
///    taken unchanged from the first operand and whose lowest element is the
///    lowest element of the second operand rounded up to an integer.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Bits [127:32] are copied to the same
///    bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value in bits [31:0] is rounded up
///    to the nearest integer and written to bits [31:0] of the result.
/// \returns A 128-bit vector of [4 x float] holding the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)                                                      \
  _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
107 | |
/// Builds a 128-bit vector of [2 x double] whose upper element is taken
///    unchanged from the first operand and whose lower element is the lower
///    element of the second operand rounded up to an integer.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. Bits [127:64] are copied to the same
///    bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value in bits [63:0] is rounded
///    up to the nearest integer and written to bits [63:0] of the result.
/// \returns A 128-bit vector of [2 x double] holding the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)                                                      \
  _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
132 | |
/// Rounds every element of a 128-bit vector of [4 x float] downward (toward
///    negative infinity) to an integer and returns the results as a 128-bit
///    vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    The 128-bit vector of [4 x float] whose elements are rounded down.
/// \returns A 128-bit vector of [4 x float] holding the rounded values.
#define _mm_floor_ps(X)                                                        \
  _mm_round_ps((X), _MM_FROUND_FLOOR)
149 | |
/// Rounds both elements of a 128-bit vector of [2 x double] downward (toward
///    negative infinity) to an integer and returns the results as a 128-bit
///    vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    The 128-bit vector of [2 x double] whose elements are rounded down.
/// \returns A 128-bit vector of [2 x double] holding the rounded values.
#define _mm_floor_pd(X)                                                        \
  _mm_round_pd((X), _MM_FROUND_FLOOR)
166 | |
/// Builds a 128-bit vector of [4 x float] whose three upper elements are
///    taken unchanged from the first operand and whose lowest element is the
///    lowest element of the second operand rounded down to an integer.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Bits [127:32] are copied to the same
///    bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value in bits [31:0] is rounded
///    down to the nearest integer and written to bits [31:0] of the result.
/// \returns A 128-bit vector of [4 x float] holding the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)                                                     \
  _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
191 | |
/// Builds a 128-bit vector of [2 x double] whose upper element is taken
///    unchanged from the first operand and whose lower element is the lower
///    element of the second operand rounded down to an integer.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. Bits [127:64] are copied to the same
///    bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value in bits [63:0] is rounded
///    down to the nearest integer and written to bits [63:0] of the result.
/// \returns A 128-bit vector of [2 x double] holding the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)                                                     \
  _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
216 | |
/// Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
249 | |
/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X),                         \
                                  (__v4sf)(__m128)(Y), (M)))
291 | |
/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
324 | |
/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X),                       \
                                   (__v2df)(__m128d)(Y), (M)))
366 | |
367 | /* SSE4 Packed Blending Intrinsics. */ |
/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))
395 | |
/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1),                        \
                                  (__v4sf)(__m128)(V2), (int)(M)))
423 | |
424 | /// Returns a 128-bit vector of [2 x double] where the values are |
425 | /// selected from either the first or second operand as specified by the |
426 | /// third operand, the control mask. |
427 | /// |
428 | /// \headerfile <x86intrin.h> |
429 | /// |
430 | /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction. |
431 | /// |
432 | /// \param __V1 |
433 | /// A 128-bit vector of [2 x double]. |
434 | /// \param __V2 |
435 | /// A 128-bit vector of [2 x double]. |
436 | /// \param __M |
437 | /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the |
438 | /// values are to be copied. The position of the mask bit corresponds to the |
439 | /// most significant bit of a copied value. When a mask bit is 0, the |
440 | /// corresponding 64-bit element in operand \a __V1 is copied to the same |
441 | /// position in the result. When a mask bit is 1, the corresponding 64-bit |
442 | /// element in operand \a __V2 is copied to the same position in the result. |
443 | /// \returns A 128-bit vector of [2 x double] containing the copied values. |
444 | static __inline__ __m128d __DEFAULT_FN_ATTRS |
445 | _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) |
446 | { |
447 | return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, |
448 | (__v2df)__M); |
449 | } |
450 | |
451 | /// Returns a 128-bit vector of [4 x float] where the values are |
452 | /// selected from either the first or second operand as specified by the |
453 | /// third operand, the control mask. |
454 | /// |
455 | /// \headerfile <x86intrin.h> |
456 | /// |
457 | /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. |
458 | /// |
459 | /// \param __V1 |
460 | /// A 128-bit vector of [4 x float]. |
461 | /// \param __V2 |
462 | /// A 128-bit vector of [4 x float]. |
463 | /// \param __M |
464 | /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying |
465 | /// how the values are to be copied. The position of the mask bit corresponds |
466 | /// to the most significant bit of a copied value. When a mask bit is 0, the |
467 | /// corresponding 32-bit element in operand \a __V1 is copied to the same |
468 | /// position in the result. When a mask bit is 1, the corresponding 32-bit |
469 | /// element in operand \a __V2 is copied to the same position in the result. |
470 | /// \returns A 128-bit vector of [4 x float] containing the copied values. |
471 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
472 | _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) |
473 | { |
474 | return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, |
475 | (__v4sf)__M); |
476 | } |
477 | |
478 | /// Returns a 128-bit vector of [16 x i8] where the values are selected |
479 | /// from either of the first or second operand as specified by the third |
480 | /// operand, the control mask. |
481 | /// |
482 | /// \headerfile <x86intrin.h> |
483 | /// |
484 | /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. |
485 | /// |
486 | /// \param __V1 |
487 | /// A 128-bit vector of [16 x i8]. |
488 | /// \param __V2 |
489 | /// A 128-bit vector of [16 x i8]. |
490 | /// \param __M |
491 | /// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying |
492 | /// how the values are to be copied. The position of the mask bit corresponds |
493 | /// to the most significant bit of a copied value. When a mask bit is 0, the |
494 | /// corresponding 8-bit element in operand \a __V1 is copied to the same |
495 | /// position in the result. When a mask bit is 1, the corresponding 8-bit |
496 | /// element in operand \a __V2 is copied to the same position in the result. |
497 | /// \returns A 128-bit vector of [16 x i8] containing the copied values. |
498 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
499 | _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) |
500 | { |
501 | return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, |
502 | (__v16qi)__M); |
503 | } |
504 | |
/// Returns a 128-bit vector of [8 x i16] where the values are selected
///    from either of the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_blend_epi16(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
                                      (__v8hi)(__m128i)(V2), (int)(M)))
532 | |
533 | /* SSE4 Dword Multiply Instructions. */ |
534 | /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] |
535 | /// and returns the lower 32 bits of the each product in a 128-bit vector of |
536 | /// [4 x i32]. |
537 | /// |
538 | /// \headerfile <x86intrin.h> |
539 | /// |
540 | /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. |
541 | /// |
542 | /// \param __V1 |
543 | /// A 128-bit integer vector. |
544 | /// \param __V2 |
545 | /// A 128-bit integer vector. |
546 | /// \returns A 128-bit integer vector containing the products of both operands. |
547 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
548 | _mm_mullo_epi32 (__m128i __V1, __m128i __V2) |
549 | { |
550 | return (__m128i) ((__v4su)__V1 * (__v4su)__V2); |
551 | } |
552 | |
553 | /// Multiplies corresponding even-indexed elements of two 128-bit |
554 | /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] |
555 | /// containing the products. |
556 | /// |
557 | /// \headerfile <x86intrin.h> |
558 | /// |
559 | /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. |
560 | /// |
561 | /// \param __V1 |
562 | /// A 128-bit vector of [4 x i32]. |
563 | /// \param __V2 |
564 | /// A 128-bit vector of [4 x i32]. |
565 | /// \returns A 128-bit vector of [2 x i64] containing the products of both |
566 | /// operands. |
567 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
568 | _mm_mul_epi32 (__m128i __V1, __m128i __V2) |
569 | { |
570 | return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); |
571 | } |
572 | |
573 | /* SSE4 Floating Point Dot Product Instructions. */ |
/// Computes the dot product of the two 128-bit vectors of [4 x float]
///    and returns it in the elements of the 128-bit result vector of
///    [4 x float].
///
///    The immediate integer operand controls which input elements
///    will contribute to the dot product, and where the final results are
///    returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [7] corresponding to the highest element of each [4 x
///    float] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [3:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [3] corresponding to the highest element of
///    the [4 x float] result. If a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X),                            \
                               (__v4sf)(__m128)(Y), (M)))
609 | |
/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
///    [2 x double].
///
///    The immediate integer operand controls which input
///    elements will contribute to the dot product, and where the final results
///    are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] determine which elements
///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
///    double] vector. If a bit is set, the corresponding elements from the two
///    input vectors are used as an input for dot product; otherwise that input
///    is treated as zero. Bits [1:0] determine which elements of the result
///    will receive a copy of the final dot product, with bit [0] corresponding
///    to the lowest element and bit [1] corresponding to the highest element of
///    the [2 x double] result. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
// The whole expansion is parenthesized so that the leading cast cannot be
// separated from the builtin call by a postfix operator (such as a subscript)
// applied to the macro invocation.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X),                          \
                                (__v2df)(__m128d)(Y), (M)))
644 | |
645 | /* SSE4 Streaming Load Hint Instruction. */ |
646 | /// Loads integer values from a 128-bit aligned memory location to a |
647 | /// 128-bit integer vector. |
648 | /// |
649 | /// \headerfile <x86intrin.h> |
650 | /// |
651 | /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. |
652 | /// |
653 | /// \param __V |
654 | /// A pointer to a 128-bit aligned memory location that contains the integer |
655 | /// values. |
656 | /// \returns A 128-bit integer vector containing the data stored at the |
657 | /// specified memory location. |
658 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
659 | _mm_stream_load_si128 (__m128i const *__V) |
660 | { |
661 | return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); |
662 | } |
663 | |
664 | /* SSE4 Packed Integer Min/Max Instructions. */ |
665 | /// Compares the corresponding elements of two 128-bit vectors of |
666 | /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser |
667 | /// of the two values. |
668 | /// |
669 | /// \headerfile <x86intrin.h> |
670 | /// |
671 | /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. |
672 | /// |
673 | /// \param __V1 |
674 | /// A 128-bit vector of [16 x i8]. |
675 | /// \param __V2 |
676 | /// A 128-bit vector of [16 x i8] |
677 | /// \returns A 128-bit vector of [16 x i8] containing the lesser values. |
678 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
679 | _mm_min_epi8 (__m128i __V1, __m128i __V2) |
680 | { |
681 | return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); |
682 | } |
683 | |
684 | /// Compares the corresponding elements of two 128-bit vectors of |
685 | /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the |
686 | /// greater value of the two. |
687 | /// |
688 | /// \headerfile <x86intrin.h> |
689 | /// |
690 | /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. |
691 | /// |
692 | /// \param __V1 |
693 | /// A 128-bit vector of [16 x i8]. |
694 | /// \param __V2 |
695 | /// A 128-bit vector of [16 x i8]. |
696 | /// \returns A 128-bit vector of [16 x i8] containing the greater values. |
697 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
698 | _mm_max_epi8 (__m128i __V1, __m128i __V2) |
699 | { |
700 | return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); |
701 | } |
702 | |
703 | /// Compares the corresponding elements of two 128-bit vectors of |
704 | /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser |
705 | /// value of the two. |
706 | /// |
707 | /// \headerfile <x86intrin.h> |
708 | /// |
709 | /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. |
710 | /// |
711 | /// \param __V1 |
712 | /// A 128-bit vector of [8 x u16]. |
713 | /// \param __V2 |
714 | /// A 128-bit vector of [8 x u16]. |
715 | /// \returns A 128-bit vector of [8 x u16] containing the lesser values. |
716 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
717 | _mm_min_epu16 (__m128i __V1, __m128i __V2) |
718 | { |
719 | return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); |
720 | } |
721 | |
722 | /// Compares the corresponding elements of two 128-bit vectors of |
723 | /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the |
724 | /// greater value of the two. |
725 | /// |
726 | /// \headerfile <x86intrin.h> |
727 | /// |
728 | /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. |
729 | /// |
730 | /// \param __V1 |
731 | /// A 128-bit vector of [8 x u16]. |
732 | /// \param __V2 |
733 | /// A 128-bit vector of [8 x u16]. |
734 | /// \returns A 128-bit vector of [8 x u16] containing the greater values. |
735 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
736 | _mm_max_epu16 (__m128i __V1, __m128i __V2) |
737 | { |
738 | return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); |
739 | } |
740 | |
741 | /// Compares the corresponding elements of two 128-bit vectors of |
742 | /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser |
743 | /// value of the two. |
744 | /// |
745 | /// \headerfile <x86intrin.h> |
746 | /// |
747 | /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. |
748 | /// |
749 | /// \param __V1 |
750 | /// A 128-bit vector of [4 x i32]. |
751 | /// \param __V2 |
752 | /// A 128-bit vector of [4 x i32]. |
753 | /// \returns A 128-bit vector of [4 x i32] containing the lesser values. |
754 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
755 | _mm_min_epi32 (__m128i __V1, __m128i __V2) |
756 | { |
757 | return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); |
758 | } |
759 | |
760 | /// Compares the corresponding elements of two 128-bit vectors of |
761 | /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the |
762 | /// greater value of the two. |
763 | /// |
764 | /// \headerfile <x86intrin.h> |
765 | /// |
766 | /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. |
767 | /// |
768 | /// \param __V1 |
769 | /// A 128-bit vector of [4 x i32]. |
770 | /// \param __V2 |
771 | /// A 128-bit vector of [4 x i32]. |
772 | /// \returns A 128-bit vector of [4 x i32] containing the greater values. |
773 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
774 | _mm_max_epi32 (__m128i __V1, __m128i __V2) |
775 | { |
776 | return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); |
777 | } |
778 | |
779 | /// Compares the corresponding elements of two 128-bit vectors of |
780 | /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser |
781 | /// value of the two. |
782 | /// |
783 | /// \headerfile <x86intrin.h> |
784 | /// |
785 | /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. |
786 | /// |
787 | /// \param __V1 |
788 | /// A 128-bit vector of [4 x u32]. |
789 | /// \param __V2 |
790 | /// A 128-bit vector of [4 x u32]. |
791 | /// \returns A 128-bit vector of [4 x u32] containing the lesser values. |
792 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
793 | _mm_min_epu32 (__m128i __V1, __m128i __V2) |
794 | { |
795 | return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); |
796 | } |
797 | |
798 | /// Compares the corresponding elements of two 128-bit vectors of |
799 | /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the |
800 | /// greater value of the two. |
801 | /// |
802 | /// \headerfile <x86intrin.h> |
803 | /// |
804 | /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. |
805 | /// |
806 | /// \param __V1 |
807 | /// A 128-bit vector of [4 x u32]. |
808 | /// \param __V2 |
809 | /// A 128-bit vector of [4 x u32]. |
810 | /// \returns A 128-bit vector of [4 x u32] containing the greater values. |
811 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
812 | _mm_max_epu32 (__m128i __V1, __m128i __V2) |
813 | { |
814 | return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); |
815 | } |
816 | |
817 | /* SSE4 Insertion and Extraction from XMM Register Instructions. */ |
/// Starts from the first argument \a X, inserts one element taken from the
///    second argument \a Y as selected by the third argument \a N, and then
///    zeroes out elements of that intermediate result, also as selected by
///    \a N. The resulting 128-bit vector of [4 x float] is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
///
/// \param X
///    A 128-bit vector source operand of [4 x float]. With the exception of
///    those bits in the result copied from parameter \a Y and zeroed by bits
///    [3:0] of \a N, all bits from this parameter are copied to the result.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. One single-precision
///    floating-point element from this source, as determined by the immediate
///    parameter, is copied to the result.
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///      00: Selects bits [31:0] from operand \a Y. \n
///      01: Selects bits [63:32] from operand \a Y. \n
///      10: Selects bits [95:64] from operand \a Y. \n
///      11: Selects bits [127:96] from operand \a Y. \n
///    Bits [5:4] specify the bits in the result to which the selected bits
///    from operand \a Y are copied: \n
///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
///    Bits [3:0]: If any of these bits are set, the corresponding result
///    element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) \
  (__builtin_ia32_insertps128((X), (Y), (N)))
859 | |
/// Returns, as a 32-bit integer, one element of a 128-bit vector of
///    [4 x float]; the immediate value parameter \a N selects the element.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determine which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
  ({ /* Write the float element, then read it back through the int member. */ \
     union { int __bits; float __val; } __pun; \
     __pun.__val = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                               (int)(N)); \
     __pun.__bits; }))
886 | |
887 | /* Miscellaneous insert and extract macros. */ |
/* Extract a single-precision float from X at index N into D.
   The body is wrapped in do { } while (0) so the macro expands to exactly one
   statement: with a bare { } block, "if (c) _MM_EXTRACT_FLOAT(d, x, 0); else"
   does not compile because the trailing ';' ends the if statement. Uses must
   be terminated with a semicolon. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
891 | |
/* Build an index suitable for _mm_insert_ps: X is shifted left by 6, Y is
   shifted left by 4, and both are OR'ed together with the zeroing bits Z. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) \
  (((X) << 6) | ((Y) << 4) | (Z))
895 | |
/* Extract the float at index N of X into the first element of the returned
   vector. The insertion starts from an all-zero vector, targets destination
   index 0, and passes 0x0e as the zeroing bits of the _mm_insert_ps index. */
#define _MM_PICK_OUT_PS(X, N) \
  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
899 | |
900 | /* Insert int into packed integer array at index. */ |
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
/* The expansion is fully parenthesized so the (__m128i) cast always applies
   to the builtin's result, whatever operators surround the macro use. */
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
                                         (int)(I), (int)(N)))
944 | |
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
/* The expansion is fully parenthesized so the (__m128i) cast always applies
   to the builtin's result, whatever operators surround the macro use. */
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
                                        (int)(I), (int)(N)))
976 | |
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
/* The expansion is fully parenthesized so the (__m128i) cast always applies
   to the builtin's result, whatever operators surround the macro use. */
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
                                        (long long)(I), (int)(N)))
#endif /* __x86_64__ */
1008 | |
1009 | /* Extract int from packed integer array at index. This returns the element |
1010 | * as a zero extended value, so it is unsigned. |
1011 | */ |
/// Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
/* The element is first converted to unsigned char so that widening to int
   zero-extends it. The expansion is fully parenthesized so the casts always
   apply to the builtin's result, whatever operators surround the macro use. */
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
                                                    (int)(N)))
1050 | |
/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns A 32-bit integer holding the element selected from the 128-bit
///    integer vector parameter.
/* The expansion is fully parenthesized so the (int) cast always applies to
   the builtin's result, whatever operators surround the macro use. */
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
1075 | |
#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
///    [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] specifies which 64-bit vector element from
///    the argument \a X to return. \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
/* The expansion is fully parenthesized so the (long long) cast always applies
   to the builtin's result, whatever operators surround the macro use. */
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64__ */
1099 | |
1100 | /* SSE4 128-bit Packed Integer Comparisons. */ |
1101 | /// Tests whether the specified bits in a 128-bit integer vector are all |
1102 | /// zeros. |
1103 | /// |
1104 | /// \headerfile <x86intrin.h> |
1105 | /// |
1106 | /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. |
1107 | /// |
1108 | /// \param __M |
1109 | /// A 128-bit integer vector containing the bits to be tested. |
1110 | /// \param __V |
1111 | /// A 128-bit integer vector selecting which bits to test in operand \a __M. |
1112 | /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. |
1113 | static __inline__ int __DEFAULT_FN_ATTRS |
1114 | _mm_testz_si128(__m128i __M, __m128i __V) |
1115 | { |
1116 | return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); |
1117 | } |
1118 | |
1119 | /// Tests whether the specified bits in a 128-bit integer vector are all |
1120 | /// ones. |
1121 | /// |
1122 | /// \headerfile <x86intrin.h> |
1123 | /// |
1124 | /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. |
1125 | /// |
1126 | /// \param __M |
1127 | /// A 128-bit integer vector containing the bits to be tested. |
1128 | /// \param __V |
1129 | /// A 128-bit integer vector selecting which bits to test in operand \a __M. |
1130 | /// \returns TRUE if the specified bits are all ones; FALSE otherwise. |
1131 | static __inline__ int __DEFAULT_FN_ATTRS |
1132 | _mm_testc_si128(__m128i __M, __m128i __V) |
1133 | { |
1134 | return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); |
1135 | } |
1136 | |
1137 | /// Tests whether the specified bits in a 128-bit integer vector are |
1138 | /// neither all zeros nor all ones. |
1139 | /// |
1140 | /// \headerfile <x86intrin.h> |
1141 | /// |
1142 | /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. |
1143 | /// |
1144 | /// \param __M |
1145 | /// A 128-bit integer vector containing the bits to be tested. |
1146 | /// \param __V |
1147 | /// A 128-bit integer vector selecting which bits to test in operand \a __M. |
1148 | /// \returns TRUE if the specified bits are neither all zeros nor all ones; |
1149 | /// FALSE otherwise. |
1150 | static __inline__ int __DEFAULT_FN_ATTRS |
1151 | _mm_testnzc_si128(__m128i __M, __m128i __V) |
1152 | { |
1153 | return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); |
1154 | } |
1155 | |
/// Tests whether the specified bits in a 128-bit integer vector are all
///    ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested.
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
/* The all-ones mask is built with _mm_set1_epi32(-1) rather than by comparing
   V with itself, so the argument V is evaluated exactly once and an argument
   with side effects is not repeated. */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
1172 | |
/// Checks whether the selected bits of a 128-bit integer vector are
///    neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
/* Forwards both arguments unchanged to _mm_testnzc_si128. */
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
1191 | |
/// Checks whether the selected bits of a 128-bit integer vector are all
///    zeros.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
/* Forwards both arguments unchanged to _mm_testz_si128. */
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1209 | |
1210 | /* SSE4 64-bit Packed Integer Comparisons. */ |
1211 | /// Compares each of the corresponding 64-bit values of the 128-bit |
1212 | /// integer vectors for equality. |
1213 | /// |
1214 | /// \headerfile <x86intrin.h> |
1215 | /// |
1216 | /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. |
1217 | /// |
1218 | /// \param __V1 |
1219 | /// A 128-bit integer vector. |
1220 | /// \param __V2 |
1221 | /// A 128-bit integer vector. |
1222 | /// \returns A 128-bit integer vector containing the comparison results. |
1223 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1224 | _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) |
1225 | { |
1226 | return (__m128i)((__v2di)__V1 == (__v2di)__V2); |
1227 | } |
1228 | |
1229 | /* SSE4 Packed Integer Sign-Extension. */ |
1230 | /// Sign-extends each of the lower eight 8-bit integer elements of a |
1231 | /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a |
1232 | /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector |
1233 | /// are unused. |
1234 | /// |
1235 | /// \headerfile <x86intrin.h> |
1236 | /// |
1237 | /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. |
1238 | /// |
1239 | /// \param __V |
1240 | /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- |
1241 | /// extended to 16-bit values. |
1242 | /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. |
1243 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1244 | _mm_cvtepi8_epi16(__m128i __V) |
1245 | { |
1246 | /* This function always performs a signed extension, but __v16qi is a char |
1247 | which may be signed or unsigned, so use __v16qs. */ |
1248 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); |
1249 | } |
1250 | |
1251 | /// Sign-extends each of the lower four 8-bit integer elements of a |
1252 | /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a |
1253 | /// 128-bit vector of [4 x i32]. The upper twelve elements of the input |
1254 | /// vector are unused. |
1255 | /// |
1256 | /// \headerfile <x86intrin.h> |
1257 | /// |
1258 | /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. |
1259 | /// |
1260 | /// \param __V |
1261 | /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are |
1262 | /// sign-extended to 32-bit values. |
1263 | /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. |
1264 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1265 | _mm_cvtepi8_epi32(__m128i __V) |
1266 | { |
1267 | /* This function always performs a signed extension, but __v16qi is a char |
1268 | which may be signed or unsigned, so use __v16qs. */ |
1269 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); |
1270 | } |
1271 | |
1272 | /// Sign-extends each of the lower two 8-bit integer elements of a |
1273 | /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in |
1274 | /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input |
1275 | /// vector are unused. |
1276 | /// |
1277 | /// \headerfile <x86intrin.h> |
1278 | /// |
1279 | /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. |
1280 | /// |
1281 | /// \param __V |
1282 | /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are |
1283 | /// sign-extended to 64-bit values. |
1284 | /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. |
1285 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1286 | _mm_cvtepi8_epi64(__m128i __V) |
1287 | { |
1288 | /* This function always performs a signed extension, but __v16qi is a char |
1289 | which may be signed or unsigned, so use __v16qs. */ |
1290 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); |
1291 | } |
1292 | |
1293 | /// Sign-extends each of the lower four 16-bit integer elements of a |
1294 | /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in |
1295 | /// a 128-bit vector of [4 x i32]. The upper four elements of the input |
1296 | /// vector are unused. |
1297 | /// |
1298 | /// \headerfile <x86intrin.h> |
1299 | /// |
1300 | /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. |
1301 | /// |
1302 | /// \param __V |
1303 | /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are |
1304 | /// sign-extended to 32-bit values. |
1305 | /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. |
1306 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1307 | _mm_cvtepi16_epi32(__m128i __V) |
1308 | { |
1309 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); |
1310 | } |
1311 | |
1312 | /// Sign-extends each of the lower two 16-bit integer elements of a |
1313 | /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in |
1314 | /// a 128-bit vector of [2 x i64]. The upper six elements of the input |
1315 | /// vector are unused. |
1316 | /// |
1317 | /// \headerfile <x86intrin.h> |
1318 | /// |
1319 | /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. |
1320 | /// |
1321 | /// \param __V |
1322 | /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are |
1323 | /// sign-extended to 64-bit values. |
1324 | /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. |
1325 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1326 | _mm_cvtepi16_epi64(__m128i __V) |
1327 | { |
1328 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); |
1329 | } |
1330 | |
1331 | /// Sign-extends each of the lower two 32-bit integer elements of a |
1332 | /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in |
1333 | /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector |
1334 | /// are unused. |
1335 | /// |
1336 | /// \headerfile <x86intrin.h> |
1337 | /// |
1338 | /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. |
1339 | /// |
1340 | /// \param __V |
1341 | /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are |
1342 | /// sign-extended to 64-bit values. |
1343 | /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. |
1344 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1345 | _mm_cvtepi32_epi64(__m128i __V) |
1346 | { |
1347 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); |
1348 | } |
1349 | |
1350 | /* SSE4 Packed Integer Zero-Extension. */ |
1351 | /// Zero-extends each of the lower eight 8-bit integer elements of a |
1352 | /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a |
1353 | /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector |
1354 | /// are unused. |
1355 | /// |
1356 | /// \headerfile <x86intrin.h> |
1357 | /// |
1358 | /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. |
1359 | /// |
1360 | /// \param __V |
1361 | /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are |
1362 | /// zero-extended to 16-bit values. |
1363 | /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. |
1364 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1365 | _mm_cvtepu8_epi16(__m128i __V) |
1366 | { |
1367 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); |
1368 | } |
1369 | |
1370 | /// Zero-extends each of the lower four 8-bit integer elements of a |
1371 | /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a |
1372 | /// 128-bit vector of [4 x i32]. The upper twelve elements of the input |
1373 | /// vector are unused. |
1374 | /// |
1375 | /// \headerfile <x86intrin.h> |
1376 | /// |
1377 | /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. |
1378 | /// |
1379 | /// \param __V |
1380 | /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are |
1381 | /// zero-extended to 32-bit values. |
1382 | /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. |
1383 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1384 | _mm_cvtepu8_epi32(__m128i __V) |
1385 | { |
1386 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); |
1387 | } |
1388 | |
1389 | /// Zero-extends each of the lower two 8-bit integer elements of a |
1390 | /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in |
1391 | /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input |
1392 | /// vector are unused. |
1393 | /// |
1394 | /// \headerfile <x86intrin.h> |
1395 | /// |
1396 | /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. |
1397 | /// |
1398 | /// \param __V |
1399 | /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are |
1400 | /// zero-extended to 64-bit values. |
1401 | /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. |
1402 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1403 | _mm_cvtepu8_epi64(__m128i __V) |
1404 | { |
1405 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); |
1406 | } |
1407 | |
1408 | /// Zero-extends each of the lower four 16-bit integer elements of a |
1409 | /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in |
1410 | /// a 128-bit vector of [4 x i32]. The upper four elements of the input |
1411 | /// vector are unused. |
1412 | /// |
1413 | /// \headerfile <x86intrin.h> |
1414 | /// |
1415 | /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. |
1416 | /// |
1417 | /// \param __V |
1418 | /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are |
1419 | /// zero-extended to 32-bit values. |
1420 | /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. |
1421 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1422 | _mm_cvtepu16_epi32(__m128i __V) |
1423 | { |
1424 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); |
1425 | } |
1426 | |
1427 | /// Zero-extends each of the lower two 16-bit integer elements of a |
1428 | /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in |
1429 | /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector |
1430 | /// are unused. |
1431 | /// |
1432 | /// \headerfile <x86intrin.h> |
1433 | /// |
1434 | /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. |
1435 | /// |
1436 | /// \param __V |
1437 | /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are |
1438 | /// zero-extended to 64-bit values. |
1439 | /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. |
1440 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1441 | _mm_cvtepu16_epi64(__m128i __V) |
1442 | { |
1443 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); |
1444 | } |
1445 | |
1446 | /// Zero-extends each of the lower two 32-bit integer elements of a |
1447 | /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in |
1448 | /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector |
1449 | /// are unused. |
1450 | /// |
1451 | /// \headerfile <x86intrin.h> |
1452 | /// |
1453 | /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. |
1454 | /// |
1455 | /// \param __V |
1456 | /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are |
1457 | /// zero-extended to 64-bit values. |
1458 | /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. |
1459 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1460 | _mm_cvtepu32_epi64(__m128i __V) |
1461 | { |
1462 | return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); |
1463 | } |
1464 | |
1465 | /* SSE4 Pack with Unsigned Saturation. */ |
1466 | /// Converts 32-bit signed integers from both 128-bit integer vector |
1467 | /// operands into 16-bit unsigned integers, and returns the packed result. |
1468 | /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than |
1469 | /// 0x0000 are saturated to 0x0000. |
1470 | /// |
1471 | /// \headerfile <x86intrin.h> |
1472 | /// |
1473 | /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. |
1474 | /// |
1475 | /// \param __V1 |
1476 | /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a |
1477 | /// signed integer and is converted to a 16-bit unsigned integer with |
1478 | /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values |
1479 | /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values |
1480 | /// are written to the lower 64 bits of the result. |
1481 | /// \param __V2 |
1482 | /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a |
1483 | /// signed integer and is converted to a 16-bit unsigned integer with |
1484 | /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values |
1485 | /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values |
1486 | /// are written to the higher 64 bits of the result. |
1487 | /// \returns A 128-bit vector of [8 x i16] containing the converted values. |
1488 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1489 | _mm_packus_epi32(__m128i __V1, __m128i __V2) |
1490 | { |
1491 | return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); |
1492 | } |
1493 | |
1494 | /* SSE4 Multiple Packed Sums of Absolute Difference. */ |
/// Subtracts 8-bit unsigned integer values and computes the absolute
///    values of the differences to the corresponding bits in the destination.
///    Then sums of the absolute differences are returned according to the bit
///    fields in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand specifying how the absolute differences are to
///    be calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                      (__v16qi)(__m128i)(Y), (M)))
1533 | |
1534 | /// Finds the minimum unsigned 16-bit element in the input 128-bit |
1535 | /// vector of [8 x u16] and returns it and along with its index. |
1536 | /// |
1537 | /// \headerfile <x86intrin.h> |
1538 | /// |
1539 | /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> |
1540 | /// instruction. |
1541 | /// |
1542 | /// \param __V |
1543 | /// A 128-bit vector of [8 x u16]. |
1544 | /// \returns A 128-bit value where bits [15:0] contain the minimum value found |
1545 | /// in parameter \a __V, bits [18:16] contain the index of the minimum value |
1546 | /// and the remaining bits are set to 0. |
1547 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
1548 | _mm_minpos_epu16(__m128i __V) |
1549 | { |
1550 | return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); |
1551 | } |
1552 | |
/* Handle the sse4.2 definitions here. */

/* These definitions are normally in nmmintrin.h, but gcc puts them in here
   so we'll do the same. */

/* From this point on, functions declared with __DEFAULT_FN_ATTRS are compiled
   for the "sse4.2" target feature instead of "sse4.1". */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))

/* These specify the type of data that we're comparing (bits [1:0] of the
   immediate operand of the string comparison intrinsics). */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03

/* These specify the type of comparison operation (bits [3:2] of the
   immediate operand). */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c

/* These macros specify the polarity of the operation (bits [5:4] of the
   immediate operand). */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* These macros are used in _mm_cmpXstri() to specify the return: bit 6 of
   the immediate operand selects the index of the least or the most
   significant set bit. */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* These macros are used in _mm_cmpXstrm() to specify the return: bit 6 of
   the immediate operand selects a zero-extended bit mask or a mask expanded
   to one element per character. They share bit 6 with the _mm_cmpXstri()
   selectors above, which is why the values are the same. */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
1586 | |
1587 | /* SSE4.2 Packed Comparison Intrinsics. */ |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))
1643 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))
1697 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))
1758 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))
1817 | |
1818 | /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1868 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1917 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
/* The whole expansion is parenthesized so that the cast still applies to the
   complete call when the macro is used as an operand of sizeof or is followed
   by a postfix operator. */
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1965 | |
1966 | /// Uses the immediate operand \a M to perform a comparison of string |
1967 | /// data with implicitly defined lengths that is contained in source operands |
1968 | /// \a A and \a B. Returns 1 if the length of the string in \a A is less than |
1969 | /// the maximum, otherwise, returns 0. |
1970 | /// |
1971 | /// \headerfile <x86intrin.h> |
1972 | /// |
1973 | /// \code |
1974 | /// int _mm_cmpistrs(__m128i A, __m128i B, const int M); |
1975 | /// \endcode |
1976 | /// |
1977 | /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> |
1978 | /// instruction. |
1979 | /// |
1980 | /// \param A |
1981 | /// A 128-bit integer vector containing one of the source operands to be |
1982 | /// compared. |
1983 | /// \param B |
1984 | /// A 128-bit integer vector containing one of the source operands to be |
1985 | /// compared. |
1986 | /// \param M |
1987 | /// An 8-bit immediate operand specifying whether the characters are bytes or |
1988 | /// words and the type of comparison to perform. \n |
1989 | /// Bits [1:0]: Determine source data format. \n |
1990 | /// 00: 16 unsigned bytes \n |
1991 | /// 01: 8 unsigned words \n |
1992 | /// 10: 16 signed bytes \n |
1993 | /// 11: 8 signed words \n |
1994 | /// Bits [3:2]: Determine comparison type and aggregation method. \n |
1995 | /// 00: Subset: Each character in \a B is compared for equality with all |
1996 | /// the characters in \a A. \n |
1997 | /// 01: Ranges: Each character in \a B is compared to \a A. The comparison |
1998 | /// basis is greater than or equal for even-indexed elements in \a A, |
1999 | /// and less than or equal for odd-indexed elements in \a A. \n |
2000 | /// 10: Match: Compare each pair of corresponding characters in \a A and |
2001 | /// \a B for equality. \n |
2002 | /// 11: Substring: Search \a B for substring matches of \a A. \n |
2003 | /// Bits [5:4]: Determine whether to perform a one's complement on the bit |
2004 | /// mask of the comparison results. \n |
2005 | /// 00: No effect. \n |
2006 | /// 01: Negate the bit mask. \n |
2007 | /// 10: No effect. \n |
2008 | /// 11: Negate the bit mask only for bits with an index less than or equal |
2009 | /// to the size of \a A or \a B. \n |
2010 | /// \returns Returns 1 if the length of the string in \a A is less than the |
2011 | /// maximum, otherwise, returns 0. |
/* The whole expansion is parenthesized so that the (int) cast stays bound to
   the builtin call regardless of the surrounding expression. */
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2015 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))
2065 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2120 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2174 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2227 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2282 | |
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
///    The macro expansion is fully parenthesized, so the result can be used
///    as an operand of any operator.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2336 | |
2337 | /* SSE4.2 Compare Packed Data -- Greater Than. */ |
2338 | /// Compares each of the corresponding 64-bit values of the 128-bit |
2339 | /// integer vectors to determine if the values in the first operand are |
2340 | /// greater than those in the second operand. |
2341 | /// |
2342 | /// \headerfile <x86intrin.h> |
2343 | /// |
2344 | /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. |
2345 | /// |
2346 | /// \param __V1 |
2347 | /// A 128-bit integer vector. |
2348 | /// \param __V2 |
2349 | /// A 128-bit integer vector. |
2350 | /// \returns A 128-bit integer vector containing the comparison results. |
2351 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
2352 | _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) |
2353 | { |
2354 | return (__m128i)((__v2di)__V1 > (__v2di)__V2); |
2355 | } |
2356 | |
2357 | /* SSE4.2 Accumulate CRC32. */ |
2358 | /// Adds the unsigned integer operand to the CRC-32C checksum of the |
2359 | /// unsigned char operand. |
2360 | /// |
2361 | /// \headerfile <x86intrin.h> |
2362 | /// |
2363 | /// This intrinsic corresponds to the <c> CRC32B </c> instruction. |
2364 | /// |
2365 | /// \param __C |
2366 | /// An unsigned integer operand to add to the CRC-32C checksum of operand |
2367 | /// \a __D. |
2368 | /// \param __D |
2369 | /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. |
2370 | /// \returns The result of adding operand \a __C to the CRC-32C checksum of |
2371 | /// operand \a __D. |
2372 | static __inline__ unsigned int __DEFAULT_FN_ATTRS |
2373 | _mm_crc32_u8(unsigned int __C, unsigned char __D) |
2374 | { |
2375 | return __builtin_ia32_crc32qi(__C, __D); |
2376 | } |
2377 | |
2378 | /// Adds the unsigned integer operand to the CRC-32C checksum of the |
2379 | /// unsigned short operand. |
2380 | /// |
2381 | /// \headerfile <x86intrin.h> |
2382 | /// |
2383 | /// This intrinsic corresponds to the <c> CRC32W </c> instruction. |
2384 | /// |
2385 | /// \param __C |
2386 | /// An unsigned integer operand to add to the CRC-32C checksum of operand |
2387 | /// \a __D. |
2388 | /// \param __D |
2389 | /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. |
2390 | /// \returns The result of adding operand \a __C to the CRC-32C checksum of |
2391 | /// operand \a __D. |
2392 | static __inline__ unsigned int __DEFAULT_FN_ATTRS |
2393 | _mm_crc32_u16(unsigned int __C, unsigned short __D) |
2394 | { |
2395 | return __builtin_ia32_crc32hi(__C, __D); |
2396 | } |
2397 | |
2398 | /// Adds the first unsigned integer operand to the CRC-32C checksum of |
2399 | /// the second unsigned integer operand. |
2400 | /// |
2401 | /// \headerfile <x86intrin.h> |
2402 | /// |
2403 | /// This intrinsic corresponds to the <c> CRC32L </c> instruction. |
2404 | /// |
2405 | /// \param __C |
2406 | /// An unsigned integer operand to add to the CRC-32C checksum of operand |
2407 | /// \a __D. |
2408 | /// \param __D |
2409 | /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. |
2410 | /// \returns The result of adding operand \a __C to the CRC-32C checksum of |
2411 | /// operand \a __D. |
2412 | static __inline__ unsigned int __DEFAULT_FN_ATTRS |
2413 | _mm_crc32_u32(unsigned int __C, unsigned int __D) |
2414 | { |
2415 | return __builtin_ia32_crc32si(__C, __D); |
2416 | } |
2417 | |
2418 | #ifdef __x86_64__ |
2419 | /// Adds the unsigned integer operand to the CRC-32C checksum of the |
2420 | /// unsigned 64-bit integer operand. |
2421 | /// |
2422 | /// \headerfile <x86intrin.h> |
2423 | /// |
2424 | /// This intrinsic corresponds to the <c> CRC32Q </c> instruction. |
2425 | /// |
2426 | /// \param __C |
2427 | /// An unsigned integer operand to add to the CRC-32C checksum of operand |
2428 | /// \a __D. |
2429 | /// \param __D |
2430 | /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. |
2431 | /// \returns The result of adding operand \a __C to the CRC-32C checksum of |
2432 | /// operand \a __D. |
2433 | static __inline__ unsigned long long __DEFAULT_FN_ATTRS |
2434 | _mm_crc32_u64(unsigned long long __C, unsigned long long __D) |
2435 | { |
2436 | return __builtin_ia32_crc32di(__C, __D); |
2437 | } |
2438 | #endif /* __x86_64__ */ |
2439 | |
2440 | #undef __DEFAULT_FN_ATTRS |
2441 | |
2442 | #include <popcntintrin.h> |
2443 | |
2444 | #endif /* __SMMINTRIN_H */ |
2445 |