smmintrin.h source code [clang_source_code/lib/Headers/smmintrin.h]

1	/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
2	*
3	* Permission is hereby granted, free of charge, to any person obtaining a copy
4	* of this software and associated documentation files (the "Software"), to deal
5	* in the Software without restriction, including without limitation the rights
6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	* copies of the Software, and to permit persons to whom the Software is
8	* furnished to do so, subject to the following conditions:
9	*
10	* The above copyright notice and this permission notice shall be included in
11	* all copies or substantial portions of the Software.
12	*
13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	* THE SOFTWARE.
20	*
21	*===-----------------------------------------------------------------------===
22	*/
23
24	#ifndef __SMMINTRIN_H
25	#define __SMMINTRIN_H
26
27	#include <tmmintrin.h>
28
/* Attribute set attached to every inline function defined in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
                 __min_vector_width__(128)))

/* SSE4 rounding-control values accepted by the _mm_round_* macros. */

/* Bits [1:0]: rounding direction; bit [2]: use the current MXCSR setting. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Bit [3]: precision-exception handling. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made combinations of an exception flag and a rounding direction.
   The two parts are joined with a plain bitwise OR. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
48
49	/// Rounds up each element of the 128-bit vector of [4 x float] to an
50	/// integer and returns the rounded values in a 128-bit vector of
51	/// [4 x float].
52	///
53	/// \headerfile <x86intrin.h>
54	///
55	/// \code
56	/// __m128 _mm_ceil_ps(__m128 X);
57	/// \endcode
58	///
59	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
60	///
61	/// \param X
62	/// A 128-bit vector of [4 x float] values to be rounded up.
63	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
/* Parenthesized as a whole so the expansion (a cast expression once
   _mm_round_ps is expanded) cannot be split by a postfix operator at the
   use site. */
#define _mm_ceil_ps(X) (_mm_round_ps((X), _MM_FROUND_CEIL))
65
66	/// Rounds up each element of the 128-bit vector of [2 x double] to an
67	/// integer and returns the rounded values in a 128-bit vector of
68	/// [2 x double].
69	///
70	/// \headerfile <x86intrin.h>
71	///
72	/// \code
73	/// __m128d _mm_ceil_pd(__m128d X);
74	/// \endcode
75	///
76	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
77	///
78	/// \param X
79	/// A 128-bit vector of [2 x double] values to be rounded up.
80	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
/* Outer parentheses make the expansion a single primary expression, so a
   trailing subscript or similar operator applies to the final result. */
#define _mm_ceil_pd(X) (_mm_round_pd((X), _MM_FROUND_CEIL))
82
83	/// Copies three upper elements of the first 128-bit vector operand to
84	/// the corresponding three upper elements of the 128-bit result vector of
85	/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
86	/// operand to an integer and copies it to the lowest element of the 128-bit
87	/// result vector of [4 x float].
88	///
89	/// \headerfile <x86intrin.h>
90	///
91	/// \code
92	/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
93	/// \endcode
94	///
95	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
96	///
97	/// \param X
98	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
99	/// copied to the corresponding bits of the result.
100	/// \param Y
101	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
102	/// rounded up to the nearest integer and copied to the corresponding bits
103	/// of the result.
104	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
105	/// values.
/* Whole expansion wrapped in parentheses so surrounding operators cannot
   bind more tightly than the cast produced by _mm_round_ss. */
#define _mm_ceil_ss(X, Y) (_mm_round_ss((X), (Y), _MM_FROUND_CEIL))
107
108	/// Copies the upper element of the first 128-bit vector operand to the
109	/// corresponding upper element of the 128-bit result vector of [2 x double].
110	/// Rounds up the lower element of the second 128-bit vector operand to an
111	/// integer and copies it to the lower element of the 128-bit result vector
112	/// of [2 x double].
113	///
114	/// \headerfile <x86intrin.h>
115	///
116	/// \code
117	/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
118	/// \endcode
119	///
120	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
121	///
122	/// \param X
123	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
124	/// copied to the corresponding bits of the result.
125	/// \param Y
126	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
127	/// rounded up to the nearest integer and copied to the corresponding bits
128	/// of the result.
129	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
130	/// values.
/* Whole expansion wrapped in parentheses so surrounding operators cannot
   bind more tightly than the cast produced by _mm_round_sd. */
#define _mm_ceil_sd(X, Y) (_mm_round_sd((X), (Y), _MM_FROUND_CEIL))
132
133	/// Rounds down each element of the 128-bit vector of [4 x float] to an
/// integer and returns the rounded values in a 128-bit vector of
135	/// [4 x float].
136	///
137	/// \headerfile <x86intrin.h>
138	///
139	/// \code
140	/// __m128 _mm_floor_ps(__m128 X);
141	/// \endcode
142	///
143	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
144	///
145	/// \param X
146	/// A 128-bit vector of [4 x float] values to be rounded down.
147	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
/* Parenthesized as a whole so the expansion (a cast expression once
   _mm_round_ps is expanded) cannot be split by a postfix operator at the
   use site. */
#define _mm_floor_ps(X) (_mm_round_ps((X), _MM_FROUND_FLOOR))
149
150	/// Rounds down each element of the 128-bit vector of [2 x double] to an
151	/// integer and returns the rounded values in a 128-bit vector of
152	/// [2 x double].
153	///
154	/// \headerfile <x86intrin.h>
155	///
156	/// \code
157	/// __m128d _mm_floor_pd(__m128d X);
158	/// \endcode
159	///
160	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
161	///
162	/// \param X
/// A 128-bit vector of [2 x double] values to be rounded down.
164	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
/* Outer parentheses make the expansion a single primary expression, so a
   trailing subscript or similar operator applies to the final result. */
#define _mm_floor_pd(X) (_mm_round_pd((X), _MM_FROUND_FLOOR))
166
167	/// Copies three upper elements of the first 128-bit vector operand to
168	/// the corresponding three upper elements of the 128-bit result vector of
169	/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
170	/// operand to an integer and copies it to the lowest element of the 128-bit
171	/// result vector of [4 x float].
172	///
173	/// \headerfile <x86intrin.h>
174	///
175	/// \code
176	/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
177	/// \endcode
178	///
179	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
180	///
181	/// \param X
182	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
183	/// copied to the corresponding bits of the result.
184	/// \param Y
185	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
186	/// rounded down to the nearest integer and copied to the corresponding bits
187	/// of the result.
188	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
189	/// values.
/* Whole expansion wrapped in parentheses so surrounding operators cannot
   bind more tightly than the cast produced by _mm_round_ss. */
#define _mm_floor_ss(X, Y) (_mm_round_ss((X), (Y), _MM_FROUND_FLOOR))
191
192	/// Copies the upper element of the first 128-bit vector operand to the
193	/// corresponding upper element of the 128-bit result vector of [2 x double].
194	/// Rounds down the lower element of the second 128-bit vector operand to an
195	/// integer and copies it to the lower element of the 128-bit result vector
196	/// of [2 x double].
197	///
198	/// \headerfile <x86intrin.h>
199	///
200	/// \code
201	/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
202	/// \endcode
203	///
204	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
205	///
206	/// \param X
207	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
208	/// copied to the corresponding bits of the result.
209	/// \param Y
210	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
211	/// rounded down to the nearest integer and copied to the corresponding bits
212	/// of the result.
213	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
214	/// values.
/* Whole expansion wrapped in parentheses so surrounding operators cannot
   bind more tightly than the cast produced by _mm_round_sd. */
#define _mm_floor_sd(X, Y) (_mm_round_sd((X), (Y), _MM_FROUND_FLOOR))
216
217	/// Rounds each element of the 128-bit vector of [4 x float] to an
218	/// integer value according to the rounding control specified by the second
219	/// argument and returns the rounded values in a 128-bit vector of
220	/// [4 x float].
221	///
222	/// \headerfile <x86intrin.h>
223	///
224	/// \code
225	/// __m128 _mm_round_ps(__m128 X, const int M);
226	/// \endcode
227	///
228	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
229	///
230	/// \param X
231	/// A 128-bit vector of [4 x float].
232	/// \param M
233	/// An integer value that specifies the rounding operation. \n
234	/// Bits [7:4] are reserved. \n
235	/// Bit [3] is a precision exception value: \n
236	/// 0: A normal PE exception is used \n
237	/// 1: The PE field is not updated \n
238	/// Bit [2] is the rounding control source: \n
239	/// 0: Use bits [1:0] of \a M \n
240	/// 1: Use the current MXCSR setting \n
241	/// Bits [1:0] contain the rounding control definition: \n
242	/// 00: Nearest \n
243	/// 01: Downward (toward negative infinity) \n
244	/// 10: Upward (toward positive infinity) \n
245	/// 11: Truncated
246	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
/* The replacement list is a cast applied to a builtin call. It is wrapped in
   one outer pair of parentheses so that a higher-precedence operator at the
   use site (for example a subscript) applies to the __m128 result rather than
   to the un-cast builtin value. */
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
249
250	/// Copies three upper elements of the first 128-bit vector operand to
251	/// the corresponding three upper elements of the 128-bit result vector of
252	/// [4 x float]. Rounds the lowest element of the second 128-bit vector
253	/// operand to an integer value according to the rounding control specified
254	/// by the third argument and copies it to the lowest element of the 128-bit
255	/// result vector of [4 x float].
256	///
257	/// \headerfile <x86intrin.h>
258	///
259	/// \code
260	/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
261	/// \endcode
262	///
263	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
264	///
265	/// \param X
266	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
267	/// copied to the corresponding bits of the result.
268	/// \param Y
269	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
270	/// rounded to the nearest integer using the specified rounding control and
271	/// copied to the corresponding bits of the result.
272	/// \param M
273	/// An integer value that specifies the rounding operation. \n
274	/// Bits [7:4] are reserved. \n
275	/// Bit [3] is a precision exception value: \n
276	/// 0: A normal PE exception is used \n
277	/// 1: The PE field is not updated \n
278	/// Bit [2] is the rounding control source: \n
279	/// 0: Use bits [1:0] of \a M \n
280	/// 1: Use the current MXCSR setting \n
281	/// Bits [1:0] contain the rounding control definition: \n
282	/// 00: Nearest \n
283	/// 01: Downward (toward negative infinity) \n
284	/// 10: Upward (toward positive infinity) \n
285	/// 11: Truncated
286	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
287	/// values.
/* Outer parentheses keep the (__m128) cast and the builtin call together as
   one expression regardless of the operators surrounding the macro use. */
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))
291
292	/// Rounds each element of the 128-bit vector of [2 x double] to an
293	/// integer value according to the rounding control specified by the second
294	/// argument and returns the rounded values in a 128-bit vector of
295	/// [2 x double].
296	///
297	/// \headerfile <x86intrin.h>
298	///
299	/// \code
300	/// __m128d _mm_round_pd(__m128d X, const int M);
301	/// \endcode
302	///
303	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
304	///
305	/// \param X
306	/// A 128-bit vector of [2 x double].
307	/// \param M
308	/// An integer value that specifies the rounding operation. \n
309	/// Bits [7:4] are reserved. \n
310	/// Bit [3] is a precision exception value: \n
311	/// 0: A normal PE exception is used \n
312	/// 1: The PE field is not updated \n
313	/// Bit [2] is the rounding control source: \n
314	/// 0: Use bits [1:0] of \a M \n
315	/// 1: Use the current MXCSR setting \n
316	/// Bits [1:0] contain the rounding control definition: \n
317	/// 00: Nearest \n
318	/// 01: Downward (toward negative infinity) \n
319	/// 10: Upward (toward positive infinity) \n
320	/// 11: Truncated
321	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
/* The replacement list is a cast applied to a builtin call. It is wrapped in
   one outer pair of parentheses so that a higher-precedence operator at the
   use site applies to the __m128d result rather than to the un-cast builtin
   value. */
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
324
325	/// Copies the upper element of the first 128-bit vector operand to the
326	/// corresponding upper element of the 128-bit result vector of [2 x double].
327	/// Rounds the lower element of the second 128-bit vector operand to an
328	/// integer value according to the rounding control specified by the third
329	/// argument and copies it to the lower element of the 128-bit result vector
330	/// of [2 x double].
331	///
332	/// \headerfile <x86intrin.h>
333	///
334	/// \code
335	/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
336	/// \endcode
337	///
338	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
339	///
340	/// \param X
341	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
342	/// copied to the corresponding bits of the result.
343	/// \param Y
344	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
345	/// rounded to the nearest integer using the specified rounding control and
346	/// copied to the corresponding bits of the result.
347	/// \param M
348	/// An integer value that specifies the rounding operation. \n
349	/// Bits [7:4] are reserved. \n
350	/// Bit [3] is a precision exception value: \n
351	/// 0: A normal PE exception is used \n
352	/// 1: The PE field is not updated \n
353	/// Bit [2] is the rounding control source: \n
354	/// 0: Use bits [1:0] of \a M \n
355	/// 1: Use the current MXCSR setting \n
356	/// Bits [1:0] contain the rounding control definition: \n
357	/// 00: Nearest \n
358	/// 01: Downward (toward negative infinity) \n
359	/// 10: Upward (toward positive infinity) \n
360	/// 11: Truncated
361	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
362	/// values.
/* Outer parentheses keep the (__m128d) cast and the builtin call together as
   one expression regardless of the operators surrounding the macro use. */
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))
366
367	/* SSE4 Packed Blending Intrinsics. */
368	/// Returns a 128-bit vector of [2 x double] where the values are
369	/// selected from either the first or second operand as specified by the
370	/// third operand, the control mask.
371	///
372	/// \headerfile <x86intrin.h>
373	///
374	/// \code
375	/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
376	/// \endcode
377	///
378	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
379	///
380	/// \param V1
381	/// A 128-bit vector of [2 x double].
382	/// \param V2
383	/// A 128-bit vector of [2 x double].
384	/// \param M
385	/// An immediate integer operand, with mask bits [1:0] specifying how the
386	/// values are to be copied. The position of the mask bit corresponds to the
387	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
388	/// element in operand \a V1 is copied to the same position in the result.
389	/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
390	/// is copied to the same position in the result.
391	/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Expansion is fully parenthesized so the leading (__m128d) cast cannot be
   separated from the builtin call by an operator at the use site. */
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
                                     (__v2df)(__m128d)(V2), (int)(M)))
395
396	/// Returns a 128-bit vector of [4 x float] where the values are selected
397	/// from either the first or second operand as specified by the third
398	/// operand, the control mask.
399	///
400	/// \headerfile <x86intrin.h>
401	///
402	/// \code
403	/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
404	/// \endcode
405	///
406	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
407	///
408	/// \param V1
409	/// A 128-bit vector of [4 x float].
410	/// \param V2
411	/// A 128-bit vector of [4 x float].
412	/// \param M
413	/// An immediate integer operand, with mask bits [3:0] specifying how the
414	/// values are to be copied. The position of the mask bit corresponds to the
415	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
416	/// element in operand \a V1 is copied to the same position in the result.
417	/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
418	/// is copied to the same position in the result.
419	/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Expansion is fully parenthesized so the leading (__m128) cast cannot be
   separated from the builtin call by an operator at the use site. */
#define _mm_blend_ps(V1, V2, M) \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
                                    (__v4sf)(__m128)(V2), (int)(M)))
423
424	/// Returns a 128-bit vector of [2 x double] where the values are
425	/// selected from either the first or second operand as specified by the
426	/// third operand, the control mask.
427	///
428	/// \headerfile <x86intrin.h>
429	///
430	/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
431	///
432	/// \param __V1
433	/// A 128-bit vector of [2 x double].
434	/// \param __V2
435	/// A 128-bit vector of [2 x double].
436	/// \param __M
437	/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
438	/// values are to be copied. The position of the mask bit corresponds to the
439	/// most significant bit of a copied value. When a mask bit is 0, the
440	/// corresponding 64-bit element in operand \a __V1 is copied to the same
441	/// position in the result. When a mask bit is 1, the corresponding 64-bit
442	/// element in operand \a __V2 is copied to the same position in the result.
443	/// \returns A 128-bit vector of [2 x double] containing the copied values.
444	static __inline__ __m128d __DEFAULT_FN_ATTRS
445	_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
446	{
447	return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
448	(__v2df)__M);
449	}
450
451	/// Returns a 128-bit vector of [4 x float] where the values are
452	/// selected from either the first or second operand as specified by the
453	/// third operand, the control mask.
454	///
455	/// \headerfile <x86intrin.h>
456	///
457	/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
458	///
459	/// \param __V1
460	/// A 128-bit vector of [4 x float].
461	/// \param __V2
462	/// A 128-bit vector of [4 x float].
463	/// \param __M
464	/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
465	/// how the values are to be copied. The position of the mask bit corresponds
466	/// to the most significant bit of a copied value. When a mask bit is 0, the
467	/// corresponding 32-bit element in operand \a __V1 is copied to the same
468	/// position in the result. When a mask bit is 1, the corresponding 32-bit
469	/// element in operand \a __V2 is copied to the same position in the result.
470	/// \returns A 128-bit vector of [4 x float] containing the copied values.
471	static __inline__ __m128 __DEFAULT_FN_ATTRS
472	_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
473	{
474	return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
475	(__v4sf)__M);
476	}
477
478	/// Returns a 128-bit vector of [16 x i8] where the values are selected
479	/// from either of the first or second operand as specified by the third
480	/// operand, the control mask.
481	///
482	/// \headerfile <x86intrin.h>
483	///
484	/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
485	///
486	/// \param __V1
487	/// A 128-bit vector of [16 x i8].
488	/// \param __V2
489	/// A 128-bit vector of [16 x i8].
490	/// \param __M
491	/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
492	/// how the values are to be copied. The position of the mask bit corresponds
493	/// to the most significant bit of a copied value. When a mask bit is 0, the
494	/// corresponding 8-bit element in operand \a __V1 is copied to the same
495	/// position in the result. When a mask bit is 1, the corresponding 8-bit
496	/// element in operand \a __V2 is copied to the same position in the result.
497	/// \returns A 128-bit vector of [16 x i8] containing the copied values.
498	static __inline__ __m128i __DEFAULT_FN_ATTRS
499	_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
500	{
501	return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
502	(__v16qi)__M);
503	}
504
505	/// Returns a 128-bit vector of [8 x i16] where the values are selected
506	/// from either of the first or second operand as specified by the third
507	/// operand, the control mask.
508	///
509	/// \headerfile <x86intrin.h>
510	///
511	/// \code
512	/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
513	/// \endcode
514	///
515	/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
516	///
517	/// \param V1
518	/// A 128-bit vector of [8 x i16].
519	/// \param V2
520	/// A 128-bit vector of [8 x i16].
521	/// \param M
522	/// An immediate integer operand, with mask bits [7:0] specifying how the
523	/// values are to be copied. The position of the mask bit corresponds to the
524	/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
525	/// element in operand \a V1 is copied to the same position in the result.
526	/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
527	/// is copied to the same position in the result.
528	/// \returns A 128-bit vector of [8 x i16] containing the copied values.
/* Expansion is fully parenthesized so the leading (__m128i) cast cannot be
   separated from the builtin call by an operator at the use site. */
#define _mm_blend_epi16(V1, V2, M) \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
                                        (__v8hi)(__m128i)(V2), (int)(M)))
532
533	/* SSE4 Dword Multiply Instructions. */
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
/// and returns the lower 32 bits of each product in a 128-bit vector of
536	/// [4 x i32].
537	///
538	/// \headerfile <x86intrin.h>
539	///
540	/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
541	///
542	/// \param __V1
543	/// A 128-bit integer vector.
544	/// \param __V2
545	/// A 128-bit integer vector.
546	/// \returns A 128-bit integer vector containing the products of both operands.
547	static __inline__ __m128i __DEFAULT_FN_ATTRS
548	_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
549	{
550	return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
551	}
552
553	/// Multiplies corresponding even-indexed elements of two 128-bit
554	/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
555	/// containing the products.
556	///
557	/// \headerfile <x86intrin.h>
558	///
559	/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
560	///
561	/// \param __V1
562	/// A 128-bit vector of [4 x i32].
563	/// \param __V2
564	/// A 128-bit vector of [4 x i32].
565	/// \returns A 128-bit vector of [2 x i64] containing the products of both
566	/// operands.
567	static __inline__ __m128i __DEFAULT_FN_ATTRS
568	_mm_mul_epi32 (__m128i __V1, __m128i __V2)
569	{
570	return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
571	}
572
573	/* SSE4 Floating Point Dot Product Instructions. */
574	/// Computes the dot product of the two 128-bit vectors of [4 x float]
575	/// and returns it in the elements of the 128-bit result vector of
576	/// [4 x float].
577	///
578	/// The immediate integer operand controls which input elements
579	/// will contribute to the dot product, and where the final results are
580	/// returned.
581	///
582	/// \headerfile <x86intrin.h>
583	///
584	/// \code
585	/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
586	/// \endcode
587	///
588	/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
589	///
590	/// \param X
591	/// A 128-bit vector of [4 x float].
592	/// \param Y
593	/// A 128-bit vector of [4 x float].
594	/// \param M
595	/// An immediate integer operand. Mask bits [7:4] determine which elements
596	/// of the input vectors are used, with bit [4] corresponding to the lowest
597	/// element and bit [7] corresponding to the highest element of each [4 x
598	/// float] vector. If a bit is set, the corresponding elements from the two
599	/// input vectors are used as an input for dot product; otherwise that input
600	/// is treated as zero. Bits [3:0] determine which elements of the result
601	/// will receive a copy of the final dot product, with bit [0] corresponding
602	/// to the lowest element and bit [3] corresponding to the highest element of
603	/// each [4 x float] subvector. If a bit is set, the dot product is returned
604	/// in the corresponding element; otherwise that element is set to zero.
605	/// \returns A 128-bit vector of [4 x float] containing the dot product.
/* Outer parentheses keep the (__m128) cast and the builtin call together as
   one expression regardless of the operators surrounding the macro use. */
#define _mm_dp_ps(X, Y, M) \
  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)))
609
610	/// Computes the dot product of the two 128-bit vectors of [2 x double]
611	/// and returns it in the elements of the 128-bit result vector of
612	/// [2 x double].
613	///
614	/// The immediate integer operand controls which input
615	/// elements will contribute to the dot product, and where the final results
616	/// are returned.
617	///
618	/// \headerfile <x86intrin.h>
619	///
620	/// \code
621	/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
622	/// \endcode
623	///
624	/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
625	///
626	/// \param X
627	/// A 128-bit vector of [2 x double].
628	/// \param Y
629	/// A 128-bit vector of [2 x double].
630	/// \param M
631	/// An immediate integer operand. Mask bits [5:4] determine which elements
632	/// of the input vectors are used, with bit [4] corresponding to the lowest
/// element and bit [5] corresponding to the highest element of each [2 x
/// double] vector. If a bit is set, the corresponding elements from the two
635	/// input vectors are used as an input for dot product; otherwise that input
636	/// is treated as zero. Bits [1:0] determine which elements of the result
637	/// will receive a copy of the final dot product, with bit [0] corresponding
638	/// to the lowest element and bit [1] corresponding to the highest element of
639	/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
/* Outer parentheses keep the (__m128d) cast and the builtin call together as
   one expression regardless of the operators surrounding the macro use. */
#define _mm_dp_pd(X, Y, M) \
  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                 (__v2df)(__m128d)(Y), (M)))
644
645	/* SSE4 Streaming Load Hint Instruction. */
646	/// Loads integer values from a 128-bit aligned memory location to a
647	/// 128-bit integer vector.
648	///
649	/// \headerfile <x86intrin.h>
650	///
651	/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
652	///
653	/// \param __V
654	/// A pointer to a 128-bit aligned memory location that contains the integer
655	/// values.
656	/// \returns A 128-bit integer vector containing the data stored at the
657	/// specified memory location.
658	static __inline__ __m128i __DEFAULT_FN_ATTRS
659	_mm_stream_load_si128 (__m128i const *__V)
660	{
661	return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
662	}
663
664	/* SSE4 Packed Integer Min/Max Instructions. */
665	/// Compares the corresponding elements of two 128-bit vectors of
666	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
667	/// of the two values.
668	///
669	/// \headerfile <x86intrin.h>
670	///
671	/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
672	///
673	/// \param __V1
674	/// A 128-bit vector of [16 x i8].
675	/// \param __V2
/// A 128-bit vector of [16 x i8].
677	/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
678	static __inline__ __m128i __DEFAULT_FN_ATTRS
679	_mm_min_epi8 (__m128i __V1, __m128i __V2)
680	{
681	return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
682	}
683
684	/// Compares the corresponding elements of two 128-bit vectors of
685	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
686	/// greater value of the two.
687	///
688	/// \headerfile <x86intrin.h>
689	///
690	/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
691	///
692	/// \param __V1
693	/// A 128-bit vector of [16 x i8].
694	/// \param __V2
695	/// A 128-bit vector of [16 x i8].
696	/// \returns A 128-bit vector of [16 x i8] containing the greater values.
697	static __inline__ __m128i __DEFAULT_FN_ATTRS
698	_mm_max_epi8 (__m128i __V1, __m128i __V2)
699	{
700	return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
701	}
702
703	/// Compares the corresponding elements of two 128-bit vectors of
704	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
705	/// value of the two.
706	///
707	/// \headerfile <x86intrin.h>
708	///
709	/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
710	///
711	/// \param __V1
712	/// A 128-bit vector of [8 x u16].
713	/// \param __V2
714	/// A 128-bit vector of [8 x u16].
715	/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
716	static __inline__ __m128i __DEFAULT_FN_ATTRS
717	_mm_min_epu16 (__m128i __V1, __m128i __V2)
718	{
719	return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
720	}
721
722	/// Compares the corresponding elements of two 128-bit vectors of
723	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
724	/// greater value of the two.
725	///
726	/// \headerfile <x86intrin.h>
727	///
728	/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
729	///
730	/// \param __V1
731	/// A 128-bit vector of [8 x u16].
732	/// \param __V2
733	/// A 128-bit vector of [8 x u16].
734	/// \returns A 128-bit vector of [8 x u16] containing the greater values.
735	static __inline__ __m128i __DEFAULT_FN_ATTRS
736	_mm_max_epu16 (__m128i __V1, __m128i __V2)
737	{
738	return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
739	}
740
741	/// Compares the corresponding elements of two 128-bit vectors of
742	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
743	/// value of the two.
744	///
745	/// \headerfile <x86intrin.h>
746	///
747	/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
748	///
749	/// \param __V1
750	/// A 128-bit vector of [4 x i32].
751	/// \param __V2
752	/// A 128-bit vector of [4 x i32].
753	/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
754	static __inline__ __m128i __DEFAULT_FN_ATTRS
755	_mm_min_epi32 (__m128i __V1, __m128i __V2)
756	{
757	return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
758	}
759
760	/// Compares the corresponding elements of two 128-bit vectors of
761	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
762	/// greater value of the two.
763	///
764	/// \headerfile <x86intrin.h>
765	///
766	/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
767	///
768	/// \param __V1
769	/// A 128-bit vector of [4 x i32].
770	/// \param __V2
771	/// A 128-bit vector of [4 x i32].
772	/// \returns A 128-bit vector of [4 x i32] containing the greater values.
773	static __inline__ __m128i __DEFAULT_FN_ATTRS
774	_mm_max_epi32 (__m128i __V1, __m128i __V2)
775	{
776	return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
777	}
778
779	/// Compares the corresponding elements of two 128-bit vectors of
780	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
781	/// value of the two.
782	///
783	/// \headerfile <x86intrin.h>
784	///
785	/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
786	///
787	/// \param __V1
788	/// A 128-bit vector of [4 x u32].
789	/// \param __V2
790	/// A 128-bit vector of [4 x u32].
791	/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
792	static __inline__ __m128i __DEFAULT_FN_ATTRS
793	_mm_min_epu32 (__m128i __V1, __m128i __V2)
794	{
795	return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
796	}
797
798	/// Compares the corresponding elements of two 128-bit vectors of
799	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
800	/// greater value of the two.
801	///
802	/// \headerfile <x86intrin.h>
803	///
804	/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
805	///
806	/// \param __V1
807	/// A 128-bit vector of [4 x u32].
808	/// \param __V2
809	/// A 128-bit vector of [4 x u32].
810	/// \returns A 128-bit vector of [4 x u32] containing the greater values.
811	static __inline__ __m128i __DEFAULT_FN_ATTRS
812	_mm_max_epu32 (__m128i __V1, __m128i __V2)
813	{
814	return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
815	}
816
817	/* SSE4 Insertion and Extraction from XMM Register Instructions. */
818	/// Takes the first argument \a X and inserts an element from the second
819	/// argument \a Y as selected by the third argument \a N. That result then
820	/// has elements zeroed out also as selected by the third argument \a N. The
821	/// resulting 128-bit vector of [4 x float] is then returned.
822	///
823	/// \headerfile <x86intrin.h>
824	///
825	/// \code
826	/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
827	/// \endcode
828	///
829	/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
830	///
831	/// \param X
832	/// A 128-bit vector source operand of [4 x float]. With the exception of
833	/// those bits in the result copied from parameter \a Y and zeroed by bits
834	/// [3:0] of \a N, all bits from this parameter are copied to the result.
835	/// \param Y
836	/// A 128-bit vector source operand of [4 x float]. One single-precision
837	/// floating-point element from this source, as determined by the immediate
838	/// parameter, is copied to the result.
839	/// \param N
840	/// Specifies which bits from operand \a Y will be copied, which bits in the
/// result they will be copied to, and which bits in the result will be
842	/// cleared. The following assignments are made: \n
843	/// Bits [7:6] specify the bits to copy from operand \a Y: \n
844	/// 00: Selects bits [31:0] from operand \a Y. \n
845	/// 01: Selects bits [63:32] from operand \a Y. \n
846	/// 10: Selects bits [95:64] from operand \a Y. \n
847	/// 11: Selects bits [127:96] from operand \a Y. \n
848	/// Bits [5:4] specify the bits in the result to which the selected bits
849	/// from operand \a Y are copied: \n
850	/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
851	/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
852	/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
853	/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
854	/// Bits[3:0]: If any of these bits are set, the corresponding result
855	/// element is cleared.
856	/// \returns A 128-bit vector of [4 x float] containing the copied
857	/// single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))
859
860	/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
861	/// returns it, using the immediate value parameter \a N as a selector.
862	///
863	/// \headerfile <x86intrin.h>
864	///
865	/// \code
866	/// int _mm_extract_ps(__m128 X, const int N);
867	/// \endcode
868	///
869	/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
870	/// instruction.
871	///
872	/// \param X
873	/// A 128-bit vector of [4 x float].
874	/// \param N
/// An immediate value. Bits [1:0] determine which bits from the argument
876	/// \a X are extracted and returned: \n
877	/// 00: Bits [31:0] of parameter \a X are returned. \n
878	/// 01: Bits [63:32] of parameter \a X are returned. \n
879	/// 10: Bits [95:64] of parameter \a X are returned. \n
880	/// 11: Bits [127:96] of parameter \a X are returned.
881	/// \returns A 32-bit integer containing the extracted 32 bits of float data.
/* The selected float lane is stored into a union and read back through its
   int member, so the result is the lane's raw 32-bit pattern. */
#define _mm_extract_ps(X, N) (__extension__ \
  ({ union { float __as_float; int __as_int; } __pun; \
     __pun.__as_float = \
       __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
     __pun.__as_int; }))
886
887	/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D.
 *
 * D must be an assignable lvalue; X is a 128-bit vector of [4 x float]; N
 * selects the element. The body is wrapped in do { } while (0) so that the
 * macro expands to exactly one statement and can be used, followed by a
 * semicolon, inside an unbraced if/else without detaching the else branch.
 */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
891
/* Build an immediate suitable for _mm_insert_ps by OR-ing together:
 *   X - placed in bits [7:6],
 *   Y - placed in bits [5:4],
 *   Z - the zeroing bits, placed unshifted in the low bits.
 * The arguments are not masked, so callers must pass values that fit their
 * fields (X and Y in 0..3, Z in 0..15).
 */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
895
/* Copy the float at index N of X into element 0 of an otherwise zero vector.
   The insertion index is built with source selector N, destination 0 and
   zeroing bits 0x0e. */
#define _MM_PICK_OUT_PS(X, N) \
  (_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)))
899
900	/* Insert int into packed integer array at index. */
901	/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
902	/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
903	/// of an integer parameter \a I into an offset specified by the immediate
904	/// value parameter \a N.
905	///
906	/// \headerfile <x86intrin.h>
907	///
908	/// \code
909	/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
910	/// \endcode
911	///
912	/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
913	///
914	/// \param X
915	/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
916	/// result and then one of the sixteen elements in the result vector is
917	/// replaced by the lower 8 bits of \a I.
918	/// \param I
919	/// An integer. The lower 8 bits of this operand are written to the result
920	/// beginning at the offset specified by \a N.
921	/// \param N
922	/// An immediate value. Bits [3:0] specify the bit offset in the result at
923	/// which the lower 8 bits of \a I are written. \n
924	/// 0000: Bits [7:0] of the result are used for insertion. \n
925	/// 0001: Bits [15:8] of the result are used for insertion. \n
926	/// 0010: Bits [23:16] of the result are used for insertion. \n
927	/// 0011: Bits [31:24] of the result are used for insertion. \n
928	/// 0100: Bits [39:32] of the result are used for insertion. \n
929	/// 0101: Bits [47:40] of the result are used for insertion. \n
930	/// 0110: Bits [55:48] of the result are used for insertion. \n
931	/// 0111: Bits [63:56] of the result are used for insertion. \n
932	/// 1000: Bits [71:64] of the result are used for insertion. \n
933	/// 1001: Bits [79:72] of the result are used for insertion. \n
934	/// 1010: Bits [87:80] of the result are used for insertion. \n
935	/// 1011: Bits [95:88] of the result are used for insertion. \n
936	/// 1100: Bits [103:96] of the result are used for insertion. \n
937	/// 1101: Bits [111:104] of the result are used for insertion. \n
938	/// 1110: Bits [119:112] of the result are used for insertion. \n
939	/// 1111: Bits [127:120] of the result are used for insertion.
940	/// \returns A 128-bit integer vector containing the constructed values.
/* The entire expansion is parenthesized so the (__m128i) cast is applied to
   the builtin's result before any operator the caller writes next to the
   macro invocation. */
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
                                         (int)(I), (int)(N)))
944
945	/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
946	/// the 128-bit integer vector parameter, and then inserting the 32-bit
947	/// integer parameter \a I at the offset specified by the immediate value
948	/// parameter \a N.
949	///
950	/// \headerfile <x86intrin.h>
951	///
952	/// \code
953	/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
954	/// \endcode
955	///
956	/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
957	///
958	/// \param X
959	/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
960	/// result and then one of the four elements in the result vector is
961	/// replaced by \a I.
962	/// \param I
963	/// A 32-bit integer that is written to the result beginning at the offset
964	/// specified by \a N.
965	/// \param N
966	/// An immediate value. Bits [1:0] specify the bit offset in the result at
967	/// which the integer \a I is written. \n
968	/// 00: Bits [31:0] of the result are used for insertion. \n
969	/// 01: Bits [63:32] of the result are used for insertion. \n
970	/// 10: Bits [95:64] of the result are used for insertion. \n
971	/// 11: Bits [127:96] of the result are used for insertion.
972	/// \returns A 128-bit integer vector containing the constructed values.
/* The entire expansion is parenthesized so the (__m128i) cast is applied to
   the builtin's result before any operator the caller writes next to the
   macro invocation. */
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
                                        (int)(I), (int)(N)))
976
977	#ifdef __x86_64__
978	/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
979	/// the 128-bit integer vector parameter, and then inserting the 64-bit
980	/// integer parameter \a I, using the immediate value parameter \a N as an
981	/// insertion location selector.
982	///
983	/// \headerfile <x86intrin.h>
984	///
985	/// \code
986	/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
987	/// \endcode
988	///
989	/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
990	///
991	/// \param X
992	/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
993	/// result and then one of the two elements in the result vector is replaced
994	/// by \a I.
995	/// \param I
996	/// A 64-bit integer that is written to the result beginning at the offset
997	/// specified by \a N.
998	/// \param N
999	/// An immediate value. Bit [0] specifies the bit offset in the result at
1000	/// which the integer \a I is written. \n
1001	/// 0: Bits [63:0] of the result are used for insertion. \n
1002	/// 1: Bits [127:64] of the result are used for insertion. \n
1003	/// \returns A 128-bit integer vector containing the constructed values.
/* The entire expansion is parenthesized so the (__m128i) cast is applied to
   the builtin's result before any operator the caller writes next to the
   macro invocation. */
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
                                        (long long)(I), (int)(N)))
1007	#endif /* __x86_64__ */
1008
1009	/* Extract int from packed integer array at index. This returns the element
1010	* as a zero extended value, so it is unsigned.
1011	*/
1012	/// Extracts an 8-bit element from the 128-bit integer vector of
1013	/// [16 x i8], using the immediate value parameter \a N as a selector.
1014	///
1015	/// \headerfile <x86intrin.h>
1016	///
1017	/// \code
1018	/// int _mm_extract_epi8(__m128i X, const int N);
1019	/// \endcode
1020	///
1021	/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1022	///
1023	/// \param X
1024	/// A 128-bit integer vector.
1025	/// \param N
1026	/// An immediate value. Bits [3:0] specify which 8-bit vector element from
1027	/// the argument \a X to extract and copy to the result. \n
1028	/// 0000: Bits [7:0] of parameter \a X are extracted. \n
1029	/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
1030	/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
1031	/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
1032	/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
1033	/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
1034	/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
1035	/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
1036	/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
1037	/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
1038	/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
1039	/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
1040	/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
1041	/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
1042	/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
1043	/// 1111: Bits [127:120] of the parameter \a X are extracted.
1044	/// \returns An unsigned integer, whose lower 8 bits are selected from the
1045	/// 128-bit integer vector parameter and the remaining bits are assigned
1046	/// zeros.
/* The builtin's result is first narrowed to unsigned char and then widened
   to int, which yields a zero-extended value. The entire expansion is
   parenthesized so the casts cannot interact with operators written next to
   the macro invocation. */
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
                                                    (int)(N)))
1050
1051	/// Extracts a 32-bit element from the 128-bit integer vector of
1052	/// [4 x i32], using the immediate value parameter \a N as a selector.
1053	///
1054	/// \headerfile <x86intrin.h>
1055	///
1056	/// \code
1057	/// int _mm_extract_epi32(__m128i X, const int N);
1058	/// \endcode
1059	///
1060	/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
1061	///
1062	/// \param X
1063	/// A 128-bit integer vector.
1064	/// \param N
1065	/// An immediate value. Bits [1:0] specify which 32-bit vector element from
1066	/// the argument \a X to extract and copy to the result. \n
1067	/// 00: Bits [31:0] of the parameter \a X are extracted. \n
1068	/// 01: Bits [63:32] of the parameter \a X are extracted. \n
1069	/// 10: Bits [95:64] of the parameter \a X are extracted. \n
/// 11: Bits [127:96] of the parameter \a X are extracted.
1071	/// \returns An integer, whose lower 32 bits are selected from the 128-bit
1072	/// integer vector parameter and the remaining bits are assigned zeros.
/* The entire expansion is parenthesized so the (int) cast cannot interact
   with operators written next to the macro invocation. */
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
1075
1076	#ifdef __x86_64__
1077	/// Extracts a 64-bit element from the 128-bit integer vector of
1078	/// [2 x i64], using the immediate value parameter \a N as a selector.
1079	///
1080	/// \headerfile <x86intrin.h>
1081	///
1082	/// \code
1083	/// long long _mm_extract_epi64(__m128i X, const int N);
1084	/// \endcode
1085	///
1086	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1087	///
1088	/// \param X
1089	/// A 128-bit integer vector.
1090	/// \param N
1091	/// An immediate value. Bit [0] specifies which 64-bit vector element from
1092	/// the argument \a X to return. \n
1093	/// 0: Bits [63:0] are returned. \n
1094	/// 1: Bits [127:64] are returned. \n
1095	/// \returns A 64-bit integer.
/* The entire expansion is parenthesized so the (long long) cast cannot
   interact with operators written next to the macro invocation. */
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64__ */
1099
1100	/* SSE4 128-bit Packed Integer Comparisons. */
1101	/// Tests whether the specified bits in a 128-bit integer vector are all
1102	/// zeros.
1103	///
1104	/// \headerfile <x86intrin.h>
1105	///
1106	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1107	///
1108	/// \param __M
1109	/// A 128-bit integer vector containing the bits to be tested.
1110	/// \param __V
1111	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1112	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1113	static __inline__ int __DEFAULT_FN_ATTRS
1114	_mm_testz_si128(__m128i __M, __m128i __V)
1115	{
1116	return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1117	}
1118
1119	/// Tests whether the specified bits in a 128-bit integer vector are all
1120	/// ones.
1121	///
1122	/// \headerfile <x86intrin.h>
1123	///
1124	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1125	///
1126	/// \param __M
1127	/// A 128-bit integer vector containing the bits to be tested.
1128	/// \param __V
1129	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1130	/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1131	static __inline__ int __DEFAULT_FN_ATTRS
1132	_mm_testc_si128(__m128i __M, __m128i __V)
1133	{
1134	return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1135	}
1136
1137	/// Tests whether the specified bits in a 128-bit integer vector are
1138	/// neither all zeros nor all ones.
1139	///
1140	/// \headerfile <x86intrin.h>
1141	///
1142	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1143	///
1144	/// \param __M
1145	/// A 128-bit integer vector containing the bits to be tested.
1146	/// \param __V
1147	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1148	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1149	/// FALSE otherwise.
1150	static __inline__ int __DEFAULT_FN_ATTRS
1151	_mm_testnzc_si128(__m128i __M, __m128i __V)
1152	{
1153	return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1154	}
1155
1156	/// Tests whether the specified bits in a 128-bit integer vector are all
1157	/// ones.
1158	///
1159	/// \headerfile <x86intrin.h>
1160	///
1161	/// \code
1162	/// int _mm_test_all_ones(__m128i V);
1163	/// \endcode
1164	///
1165	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1166	///
1167	/// \param V
1168	/// A 128-bit integer vector containing the bits to be tested.
1169	/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1170	/// otherwise.
/* Test V against a constant all-ones mask. Building the mask with
   _mm_set1_epi32(-1) instead of comparing V with itself means the argument
   V is expanded, and therefore evaluated, exactly once. */
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
1172
1173	/// Tests whether the specified bits in a 128-bit integer vector are
1174	/// neither all zeros nor all ones.
1175	///
1176	/// \headerfile <x86intrin.h>
1177	///
1178	/// \code
1179	/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1180	/// \endcode
1181	///
1182	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1183	///
1184	/// \param M
1185	/// A 128-bit integer vector containing the bits to be tested.
1186	/// \param V
1187	/// A 128-bit integer vector selecting which bits to test in operand \a M.
1188	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1189	/// FALSE otherwise.
/* Alias that forwards both arguments unchanged to _mm_testnzc_si128. */
#define _mm_test_mix_ones_zeros(M, V) \
  _mm_testnzc_si128((M), (V))
1191
1192	/// Tests whether the specified bits in a 128-bit integer vector are all
1193	/// zeros.
1194	///
1195	/// \headerfile <x86intrin.h>
1196	///
1197	/// \code
1198	/// int _mm_test_all_zeros(__m128i M, __m128i V);
1199	/// \endcode
1200	///
1201	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1202	///
1203	/// \param M
1204	/// A 128-bit integer vector containing the bits to be tested.
1205	/// \param V
1206	/// A 128-bit integer vector selecting which bits to test in operand \a M.
1207	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
/* Alias that forwards both arguments unchanged to _mm_testz_si128. */
#define _mm_test_all_zeros(M, V) \
  _mm_testz_si128((M), (V))
1209
1210	/* SSE4 64-bit Packed Integer Comparisons. */
1211	/// Compares each of the corresponding 64-bit values of the 128-bit
1212	/// integer vectors for equality.
1213	///
1214	/// \headerfile <x86intrin.h>
1215	///
1216	/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1217	///
1218	/// \param __V1
1219	/// A 128-bit integer vector.
1220	/// \param __V2
1221	/// A 128-bit integer vector.
1222	/// \returns A 128-bit integer vector containing the comparison results.
1223	static __inline__ __m128i __DEFAULT_FN_ATTRS
1224	_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
1225	{
1226	return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1227	}
1228
1229	/* SSE4 Packed Integer Sign-Extension. */
1230	/// Sign-extends each of the lower eight 8-bit integer elements of a
1231	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1232	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1233	/// are unused.
1234	///
1235	/// \headerfile <x86intrin.h>
1236	///
1237	/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1238	///
1239	/// \param __V
1240	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
1241	/// extended to 16-bit values.
1242	/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1243	static __inline__ __m128i __DEFAULT_FN_ATTRS
1244	_mm_cvtepi8_epi16(__m128i __V)
1245	{
1246	/* This function always performs a signed extension, but __v16qi is a char
1247	which may be signed or unsigned, so use __v16qs. */
1248	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1249	}
1250
1251	/// Sign-extends each of the lower four 8-bit integer elements of a
1252	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1253	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1254	/// vector are unused.
1255	///
1256	/// \headerfile <x86intrin.h>
1257	///
1258	/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1259	///
1260	/// \param __V
1261	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1262	/// sign-extended to 32-bit values.
1263	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1264	static __inline__ __m128i __DEFAULT_FN_ATTRS
1265	_mm_cvtepi8_epi32(__m128i __V)
1266	{
1267	/* This function always performs a signed extension, but __v16qi is a char
1268	which may be signed or unsigned, so use __v16qs. */
1269	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1270	}
1271
1272	/// Sign-extends each of the lower two 8-bit integer elements of a
1273	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1274	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1275	/// vector are unused.
1276	///
1277	/// \headerfile <x86intrin.h>
1278	///
1279	/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1280	///
1281	/// \param __V
1282	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1283	/// sign-extended to 64-bit values.
1284	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1285	static __inline__ __m128i __DEFAULT_FN_ATTRS
1286	_mm_cvtepi8_epi64(__m128i __V)
1287	{
1288	/* This function always performs a signed extension, but __v16qi is a char
1289	which may be signed or unsigned, so use __v16qs. */
1290	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1291	}
1292
1293	/// Sign-extends each of the lower four 16-bit integer elements of a
1294	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1295	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1296	/// vector are unused.
1297	///
1298	/// \headerfile <x86intrin.h>
1299	///
1300	/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1301	///
1302	/// \param __V
1303	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1304	/// sign-extended to 32-bit values.
1305	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1306	static __inline__ __m128i __DEFAULT_FN_ATTRS
1307	_mm_cvtepi16_epi32(__m128i __V)
1308	{
1309	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1310	}
1311
1312	/// Sign-extends each of the lower two 16-bit integer elements of a
1313	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1314	/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1315	/// vector are unused.
1316	///
1317	/// \headerfile <x86intrin.h>
1318	///
1319	/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1320	///
1321	/// \param __V
1322	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1323	/// sign-extended to 64-bit values.
1324	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1325	static __inline__ __m128i __DEFAULT_FN_ATTRS
1326	_mm_cvtepi16_epi64(__m128i __V)
1327	{
1328	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1329	}
1330
1331	/// Sign-extends each of the lower two 32-bit integer elements of a
1332	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1333	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1334	/// are unused.
1335	///
1336	/// \headerfile <x86intrin.h>
1337	///
1338	/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1339	///
1340	/// \param __V
1341	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1342	/// sign-extended to 64-bit values.
1343	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1344	static __inline__ __m128i __DEFAULT_FN_ATTRS
1345	_mm_cvtepi32_epi64(__m128i __V)
1346	{
1347	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1348	}
1349
1350	/* SSE4 Packed Integer Zero-Extension. */
1351	/// Zero-extends each of the lower eight 8-bit integer elements of a
1352	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1353	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1354	/// are unused.
1355	///
1356	/// \headerfile <x86intrin.h>
1357	///
1358	/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1359	///
1360	/// \param __V
1361	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1362	/// zero-extended to 16-bit values.
1363	/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1364	static __inline__ __m128i __DEFAULT_FN_ATTRS
1365	_mm_cvtepu8_epi16(__m128i __V)
1366	{
1367	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
1368	}
1369
1370	/// Zero-extends each of the lower four 8-bit integer elements of a
1371	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1372	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1373	/// vector are unused.
1374	///
1375	/// \headerfile <x86intrin.h>
1376	///
1377	/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1378	///
1379	/// \param __V
1380	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1381	/// zero-extended to 32-bit values.
1382	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1383	static __inline__ __m128i __DEFAULT_FN_ATTRS
1384	_mm_cvtepu8_epi32(__m128i __V)
1385	{
1386	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1387	}
1388
1389	/// Zero-extends each of the lower two 8-bit integer elements of a
1390	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1391	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1392	/// vector are unused.
1393	///
1394	/// \headerfile <x86intrin.h>
1395	///
1396	/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1397	///
1398	/// \param __V
1399	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1400	/// zero-extended to 64-bit values.
1401	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1402	static __inline__ __m128i __DEFAULT_FN_ATTRS
1403	_mm_cvtepu8_epi64(__m128i __V)
1404	{
1405	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1406	}
1407
1408	/// Zero-extends each of the lower four 16-bit integer elements of a
1409	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1410	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1411	/// vector are unused.
1412	///
1413	/// \headerfile <x86intrin.h>
1414	///
1415	/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1416	///
1417	/// \param __V
1418	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1419	/// zero-extended to 32-bit values.
1420	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1421	static __inline__ __m128i __DEFAULT_FN_ATTRS
1422	_mm_cvtepu16_epi32(__m128i __V)
1423	{
1424	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1425	}
1426
1427	/// Zero-extends each of the lower two 16-bit integer elements of a
1428	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1429	/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1430	/// are unused.
1431	///
1432	/// \headerfile <x86intrin.h>
1433	///
1434	/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1435	///
1436	/// \param __V
1437	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1438	/// zero-extended to 64-bit values.
1439	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1440	static __inline__ __m128i __DEFAULT_FN_ATTRS
1441	_mm_cvtepu16_epi64(__m128i __V)
1442	{
1443	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1444	}
1445
1446	/// Zero-extends each of the lower two 32-bit integer elements of a
1447	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1448	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1449	/// are unused.
1450	///
1451	/// \headerfile <x86intrin.h>
1452	///
1453	/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1454	///
1455	/// \param __V
1456	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1457	/// zero-extended to 64-bit values.
1458	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1459	static __inline__ __m128i __DEFAULT_FN_ATTRS
1460	_mm_cvtepu32_epi64(__m128i __V)
1461	{
1462	return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1463	}
1464
1465	/* SSE4 Pack with Unsigned Saturation. */
1466	/// Packs the 32-bit signed integers of two 128-bit vector operands into
1467	/// 16-bit unsigned integers using unsigned saturation and returns the
1468	/// combined result. Inputs greater than 0xFFFF become 0xFFFF; inputs less
1469	/// than 0x0000 become 0x0000.
1470	///
1471	/// \headerfile <x86intrin.h>
1472	///
1473	/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1474	///
1475	/// \param __V1
1476	/// A 128-bit vector of [4 x i32]. Every element is interpreted as a
1477	/// signed integer and narrowed to a 16-bit unsigned integer with
1478	/// saturation (greater than 0xFFFF becomes 0xFFFF, less than 0x0000
1479	/// becomes 0x0000). The four converted values occupy the lower 64 bits
1480	/// of the result.
1481	/// \param __V2
1482	/// A 128-bit vector of [4 x i32]. Every element is interpreted as a
1483	/// signed integer and narrowed to a 16-bit unsigned integer with
1484	/// saturation (greater than 0xFFFF becomes 0xFFFF, less than 0x0000
1485	/// becomes 0x0000). The four converted values occupy the upper 64 bits
1486	/// of the result.
1487	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1488	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2)
1489	{
1490	__v4si __lo = (__v4si)__V1, __hi = (__v4si)__V2;
1491	return (__m128i)__builtin_ia32_packusdw128(__lo, __hi);
1492	}
1493
1494	/* SSE4 Multiple Packed Sums of Absolute Difference. */
1495	/// Subtracts 8-bit unsigned integer values of one operand from those of the
1496	/// other and computes the absolute values of the differences. Sums of four
1497	/// consecutive absolute differences are then returned, with the starting
1498	/// offsets in each operand selected by bit fields in the immediate operand.
1499	///
1500	/// \headerfile <x86intrin.h>
1501	///
1502	/// \code
1503	/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
1504	/// \endcode
1505	///
1506	/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
1507	///
1508	/// \param X
1509	/// A 128-bit vector of [16 x i8].
1510	/// \param Y
1511	/// A 128-bit vector of [16 x i8].
1512	/// \param M
1513	/// An 8-bit immediate operand specifying how the absolute differences are to
1514	/// be calculated, according to the following algorithm:
1515	/// \code
1516	/// // M2 represents bit 2 of the immediate operand
1517	/// // M10 represents bits [1:0] of the immediate operand
1518	/// i = M2 * 4;
1519	/// j = M10 * 4;
1520	/// for (k = 0; k < 8; k = k + 1) {
1521	/// d0 = abs(X[i + k + 0] - Y[j + 0]);
1522	/// d1 = abs(X[i + k + 1] - Y[j + 1]);
1523	/// d2 = abs(X[i + k + 2] - Y[j + 2]);
1524	/// d3 = abs(X[i + k + 3] - Y[j + 3]);
1525	/// r[k] = d0 + d1 + d2 + d3;
1526	/// }
1527	/// \endcode
1528	/// \returns A 128-bit integer vector containing the sums of the sets of
1529	/// absolute differences between both operands.
1530	#define _mm_mpsadbw_epu8(X, Y, M) \
1531	((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
1532	(__v16qi)(__m128i)(Y), (M))) /* whole expansion parenthesized so the cast result is a single operand */
1533
1534	/// Locates the smallest unsigned 16-bit element of the 128-bit input
1535	/// vector of [8 x u16] and returns that value together with its index.
1536	///
1537	/// \headerfile <x86intrin.h>
1538	///
1539	/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1540	/// instruction.
1541	///
1542	/// \param __V
1543	/// A 128-bit vector of [8 x u16].
1544	/// \returns A 128-bit value in which bits [15:0] hold the minimum value of
1545	/// \a __V, bits [18:16] hold the index of that element, and all remaining
1546	/// bits are zero.
1547	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V)
1548	{
1549	__v8hi __words = (__v8hi)__V;
1550	return (__m128i)__builtin_ia32_phminposuw128(__words);
1551	}
1552
1553	/* Handle the SSE4.2 definitions here. */
1554
1555	/* These definitions are normally in nmmintrin.h, but gcc puts them in here,
1556	so we'll do the same. */
1557
1558	#undef __DEFAULT_FN_ATTRS /* discard the earlier definition before redefining it for the SSE4.2 section */
1559	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) /* NOTE(review): no __min_vector_width__(128) here, unlike the "sse4.1" definition at the top of the file - confirm intentional */
1560
1561	/* These specify the type of data that we're comparing (bits [1:0] of the immediate operand). */
1562	#define _SIDD_UBYTE_OPS 0x00 /* 16 unsigned bytes */
1563	#define _SIDD_UWORD_OPS 0x01 /* 8 unsigned words */
1564	#define _SIDD_SBYTE_OPS 0x02 /* 16 signed bytes */
1565	#define _SIDD_SWORD_OPS 0x03 /* 8 signed words */
1566
1567	/* These specify the type of comparison operation (bits [3:2] of the immediate operand). */
1568	#define _SIDD_CMP_EQUAL_ANY 0x00 /* subset */
1569	#define _SIDD_CMP_RANGES 0x04 /* ranges */
1570	#define _SIDD_CMP_EQUAL_EACH 0x08 /* match of corresponding characters */
1571	#define _SIDD_CMP_EQUAL_ORDERED 0x0c /* substring search */
1572
1573	/* These macros specify the polarity of the operation (bits [5:4] of the immediate operand). */
1574	#define _SIDD_POSITIVE_POLARITY 0x00 /* bit mask left as is */
1575	#define _SIDD_NEGATIVE_POLARITY 0x10 /* bit mask negated */
1576	#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 /* bit mask left as is */
1577	#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 /* bit mask negated only within the string length */
1578
1579	/* These macros are used in _mm_cmpXstri() to specify which set bit's index is returned (bit [6]). */
1580	#define _SIDD_LEAST_SIGNIFICANT 0x00 /* index of the least significant set bit */
1581	#define _SIDD_MOST_SIGNIFICANT 0x40 /* index of the most significant set bit */
1582
1583	/* These macros are used in _mm_cmpXstrm() to specify the format of the returned mask (bit [6]). */
1584	#define _SIDD_BIT_MASK 0x00 /* result zero-extended to 16 bytes */
1585	#define _SIDD_UNIT_MASK 0x40 /* each result bit expanded to a full byte or word */
1586
1587	/* SSE4.2 Packed Comparison Intrinsics. */
1588	/// Uses the immediate operand \a M to perform a comparison of string
1589	/// data with implicitly defined lengths that is contained in source operands
1590	/// \a A and \a B. Returns a 128-bit integer vector representing the result
1591	/// mask of the comparison.
1592	///
1593	/// \headerfile <x86intrin.h>
1594	///
1595	/// \code
1596	/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1597	/// \endcode
1598	///
1599	/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1600	/// instruction.
1601	///
1602	/// \param A
1603	/// A 128-bit integer vector containing one of the source operands to be
1604	/// compared.
1605	/// \param B
1606	/// A 128-bit integer vector containing one of the source operands to be
1607	/// compared.
1608	/// \param M
1609	/// An 8-bit immediate operand specifying whether the characters are bytes or
1610	/// words, the type of comparison to perform, and the format of the return
1611	/// value. \n
1612	/// Bits [1:0]: Determine source data format. \n
1613	/// 00: 16 unsigned bytes \n
1614	/// 01: 8 unsigned words \n
1615	/// 10: 16 signed bytes \n
1616	/// 11: 8 signed words \n
1617	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1618	/// 00: Subset: Each character in \a B is compared for equality with all
1619	/// the characters in \a A. \n
1620	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1621	/// basis is greater than or equal for even-indexed elements in \a A,
1622	/// and less than or equal for odd-indexed elements in \a A. \n
1623	/// 10: Match: Compare each pair of corresponding characters in \a A and
1624	/// \a B for equality. \n
1625	/// 11: Substring: Search \a B for substring matches of \a A. \n
1626	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1627	/// mask of the comparison results. \n
1628	/// 00: No effect. \n
1629	/// 01: Negate the bit mask. \n
1630	/// 10: No effect. \n
1631	/// 11: Negate the bit mask only for bits with an index less than or equal
1632	/// to the size of \a A or \a B. \n
1633	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1634	/// bytes. \n
1635	/// 0: The result is zero-extended to 16 bytes. \n
1636	/// 1: The result is expanded to 16 bytes (this expansion is performed by
1637	/// repeating each bit 8 or 16 times).
1638	/// \returns Returns a 128-bit integer vector representing the result mask of
1639	/// the comparison.
1640	#define _mm_cmpistrm(A, B, M) \
1641	((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
1642	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1643
1644	/// Uses the immediate operand \a M to perform a comparison of string
1645	/// data with implicitly defined lengths that is contained in source operands
1646	/// \a A and \a B. Returns an integer representing the result index of the
1647	/// comparison.
1648	///
1649	/// \headerfile <x86intrin.h>
1650	///
1651	/// \code
1652	/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1653	/// \endcode
1654	///
1655	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1656	/// instruction.
1657	///
1658	/// \param A
1659	/// A 128-bit integer vector containing one of the source operands to be
1660	/// compared.
1661	/// \param B
1662	/// A 128-bit integer vector containing one of the source operands to be
1663	/// compared.
1664	/// \param M
1665	/// An 8-bit immediate operand specifying whether the characters are bytes or
1666	/// words, the type of comparison to perform, and the format of the return
1667	/// value. \n
1668	/// Bits [1:0]: Determine source data format. \n
1669	/// 00: 16 unsigned bytes \n
1670	/// 01: 8 unsigned words \n
1671	/// 10: 16 signed bytes \n
1672	/// 11: 8 signed words \n
1673	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1674	/// 00: Subset: Each character in \a B is compared for equality with all
1675	/// the characters in \a A. \n
1676	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1677	/// basis is greater than or equal for even-indexed elements in \a A,
1678	/// and less than or equal for odd-indexed elements in \a A. \n
1679	/// 10: Match: Compare each pair of corresponding characters in \a A and
1680	/// \a B for equality. \n
1681	/// 11: Substring: Search \a B for substring matches of \a A. \n
1682	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1683	/// mask of the comparison results. \n
1684	/// 00: No effect. \n
1685	/// 01: Negate the bit mask. \n
1686	/// 10: No effect. \n
1687	/// 11: Negate the bit mask only for bits with an index less than or equal
1688	/// to the size of \a A or \a B. \n
1689	/// Bit [6]: Determines whether the index of the lowest set bit or the
1690	/// highest set bit is returned. \n
1691	/// 0: The index of the least significant set bit. \n
1692	/// 1: The index of the most significant set bit. \n
1693	/// \returns Returns an integer representing the result index of the comparison.
1694	#define _mm_cmpistri(A, B, M) \
1695	((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
1696	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1697
1698	/// Uses the immediate operand \a M to perform a comparison of string
1699	/// data with explicitly defined lengths that is contained in source operands
1700	/// \a A and \a B. Returns a 128-bit integer vector representing the result
1701	/// mask of the comparison.
1702	///
1703	/// \headerfile <x86intrin.h>
1704	///
1705	/// \code
1706	/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1707	/// \endcode
1708	///
1709	/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1710	/// instruction.
1711	///
1712	/// \param A
1713	/// A 128-bit integer vector containing one of the source operands to be
1714	/// compared.
1715	/// \param LA
1716	/// An integer that specifies the length of the string in \a A.
1717	/// \param B
1718	/// A 128-bit integer vector containing one of the source operands to be
1719	/// compared.
1720	/// \param LB
1721	/// An integer that specifies the length of the string in \a B.
1722	/// \param M
1723	/// An 8-bit immediate operand specifying whether the characters are bytes or
1724	/// words, the type of comparison to perform, and the format of the return
1725	/// value. \n
1726	/// Bits [1:0]: Determine source data format. \n
1727	/// 00: 16 unsigned bytes \n
1728	/// 01: 8 unsigned words \n
1729	/// 10: 16 signed bytes \n
1730	/// 11: 8 signed words \n
1731	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1732	/// 00: Subset: Each character in \a B is compared for equality with all
1733	/// the characters in \a A. \n
1734	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1735	/// basis is greater than or equal for even-indexed elements in \a A,
1736	/// and less than or equal for odd-indexed elements in \a A. \n
1737	/// 10: Match: Compare each pair of corresponding characters in \a A and
1738	/// \a B for equality. \n
1739	/// 11: Substring: Search \a B for substring matches of \a A. \n
1740	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1741	/// mask of the comparison results. \n
1742	/// 00: No effect. \n
1743	/// 01: Negate the bit mask. \n
1744	/// 10: No effect. \n
1745	/// 11: Negate the bit mask only for bits with an index less than or equal
1746	/// to the size of \a A or \a B. \n
1747	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1748	/// bytes. \n
1749	/// 0: The result is zero-extended to 16 bytes. \n
1750	/// 1: The result is expanded to 16 bytes (this expansion is performed by
1751	/// repeating each bit 8 or 16 times). \n
1752	/// \returns Returns a 128-bit integer vector representing the result mask of
1753	/// the comparison.
1754	#define _mm_cmpestrm(A, LA, B, LB, M) \
1755	((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
1756	(__v16qi)(__m128i)(B), (int)(LB), \
1757	(int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1758
1759	/// Uses the immediate operand \a M to perform a comparison of string
1760	/// data with explicitly defined lengths that is contained in source operands
1761	/// \a A and \a B. Returns an integer representing the result index of the
1762	/// comparison.
1763	///
1764	/// \headerfile <x86intrin.h>
1765	///
1766	/// \code
1767	/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1768	/// \endcode
1769	///
1770	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1771	/// instruction.
1772	///
1773	/// \param A
1774	/// A 128-bit integer vector containing one of the source operands to be
1775	/// compared.
1776	/// \param LA
1777	/// An integer that specifies the length of the string in \a A.
1778	/// \param B
1779	/// A 128-bit integer vector containing one of the source operands to be
1780	/// compared.
1781	/// \param LB
1782	/// An integer that specifies the length of the string in \a B.
1783	/// \param M
1784	/// An 8-bit immediate operand specifying whether the characters are bytes or
1785	/// words, the type of comparison to perform, and the format of the return
1786	/// value. \n
1787	/// Bits [1:0]: Determine source data format. \n
1788	/// 00: 16 unsigned bytes \n
1789	/// 01: 8 unsigned words \n
1790	/// 10: 16 signed bytes \n
1791	/// 11: 8 signed words \n
1792	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1793	/// 00: Subset: Each character in \a B is compared for equality with all
1794	/// the characters in \a A. \n
1795	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1796	/// basis is greater than or equal for even-indexed elements in \a A,
1797	/// and less than or equal for odd-indexed elements in \a A. \n
1798	/// 10: Match: Compare each pair of corresponding characters in \a A and
1799	/// \a B for equality. \n
1800	/// 11: Substring: Search \a B for substring matches of \a A. \n
1801	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1802	/// mask of the comparison results. \n
1803	/// 00: No effect. \n
1804	/// 01: Negate the bit mask. \n
1805	/// 10: No effect. \n
1806	/// 11: Negate the bit mask only for bits with an index less than or equal
1807	/// to the size of \a A or \a B. \n
1808	/// Bit [6]: Determines whether the index of the lowest set bit or the
1809	/// highest set bit is returned. \n
1810	/// 0: The index of the least significant set bit. \n
1811	/// 1: The index of the most significant set bit. \n
1812	/// \returns Returns an integer representing the result index of the comparison.
1813	#define _mm_cmpestri(A, LA, B, LB, M) \
1814	((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
1815	(__v16qi)(__m128i)(B), (int)(LB), \
1816	(int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1817
1818	/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1819	/// Uses the immediate operand \a M to perform a comparison of string
1820	/// data with implicitly defined lengths that is contained in source operands
1821	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1822	/// string in \a B is the maximum, otherwise, returns 0.
1823	///
1824	/// \headerfile <x86intrin.h>
1825	///
1826	/// \code
1827	/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1828	/// \endcode
1829	///
1830	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1831	/// instruction.
1832	///
1833	/// \param A
1834	/// A 128-bit integer vector containing one of the source operands to be
1835	/// compared.
1836	/// \param B
1837	/// A 128-bit integer vector containing one of the source operands to be
1838	/// compared.
1839	/// \param M
1840	/// An 8-bit immediate operand specifying whether the characters are bytes or
1841	/// words and the type of comparison to perform. \n
1842	/// Bits [1:0]: Determine source data format. \n
1843	/// 00: 16 unsigned bytes \n
1844	/// 01: 8 unsigned words \n
1845	/// 10: 16 signed bytes \n
1846	/// 11: 8 signed words \n
1847	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1848	/// 00: Subset: Each character in \a B is compared for equality with all
1849	/// the characters in \a A. \n
1850	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1851	/// basis is greater than or equal for even-indexed elements in \a A,
1852	/// and less than or equal for odd-indexed elements in \a A. \n
1853	/// 10: Match: Compare each pair of corresponding characters in \a A and
1854	/// \a B for equality. \n
1855	/// 11: Substring: Search \a B for substring matches of \a A. \n
1856	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1857	/// mask of the comparison results. \n
1858	/// 00: No effect. \n
1859	/// 01: Negate the bit mask. \n
1860	/// 10: No effect. \n
1861	/// 11: Negate the bit mask only for bits with an index less than or equal
1862	/// to the size of \a A or \a B. \n
1863	/// \returns Returns 1 if the bit mask is zero and the length of the string in
1864	/// \a B is the maximum; otherwise, returns 0.
1865	#define _mm_cmpistra(A, B, M) \
1866	((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
1867	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1868
1869	/// Uses the immediate operand \a M to perform a comparison of string
1870	/// data with implicitly defined lengths that is contained in source operands
1871	/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1872	/// 0.
1873	///
1874	/// \headerfile <x86intrin.h>
1875	///
1876	/// \code
1877	/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1878	/// \endcode
1879	///
1880	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1881	/// instruction.
1882	///
1883	/// \param A
1884	/// A 128-bit integer vector containing one of the source operands to be
1885	/// compared.
1886	/// \param B
1887	/// A 128-bit integer vector containing one of the source operands to be
1888	/// compared.
1889	/// \param M
1890	/// An 8-bit immediate operand specifying whether the characters are bytes or
1891	/// words and the type of comparison to perform. \n
1892	/// Bits [1:0]: Determine source data format. \n
1893	/// 00: 16 unsigned bytes \n
1894	/// 01: 8 unsigned words \n
1895	/// 10: 16 signed bytes \n
1896	/// 11: 8 signed words \n
1897	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1898	/// 00: Subset: Each character in \a B is compared for equality with all
1899	/// the characters in \a A. \n
1900	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1901	/// basis is greater than or equal for even-indexed elements in \a A,
1902	/// and less than or equal for odd-indexed elements in \a A. \n
1903	/// 10: Match: Compare each pair of corresponding characters in \a A and
1904	/// \a B for equality. \n
1905	/// 11: Substring: Search \a B for substring matches of \a A. \n
1906	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1907	/// mask of the comparison results. \n
1908	/// 00: No effect. \n
1909	/// 01: Negate the bit mask. \n
1910	/// 10: No effect. \n
1911	/// 11: Negate the bit mask only for bits with an index less than or equal
1912	/// to the size of \a A or \a B.
1913	/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
1914	#define _mm_cmpistrc(A, B, M) \
1915	((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
1916	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1917
1918	/// Uses the immediate operand \a M to perform a comparison of string
1919	/// data with implicitly defined lengths that is contained in source operands
1920	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1921	///
1922	/// \headerfile <x86intrin.h>
1923	///
1924	/// \code
1925	/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1926	/// \endcode
1927	///
1928	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1929	/// instruction.
1930	///
1931	/// \param A
1932	/// A 128-bit integer vector containing one of the source operands to be
1933	/// compared.
1934	/// \param B
1935	/// A 128-bit integer vector containing one of the source operands to be
1936	/// compared.
1937	/// \param M
1938	/// An 8-bit immediate operand specifying whether the characters are bytes or
1939	/// words and the type of comparison to perform. \n
1940	/// Bits [1:0]: Determine source data format. \n
1941	/// 00: 16 unsigned bytes \n
1942	/// 01: 8 unsigned words \n
1943	/// 10: 16 signed bytes \n
1944	/// 11: 8 signed words \n
1945	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1946	/// 00: Subset: Each character in \a B is compared for equality with all
1947	/// the characters in \a A. \n
1948	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1949	/// basis is greater than or equal for even-indexed elements in \a A,
1950	/// and less than or equal for odd-indexed elements in \a A. \n
1951	/// 10: Match: Compare each pair of corresponding characters in \a A and
1952	/// \a B for equality. \n
1953	/// 11: Substring: Search \a B for substring matches of \a A. \n
1954	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1955	/// mask of the comparison results. \n
1956	/// 00: No effect. \n
1957	/// 01: Negate the bit mask. \n
1958	/// 10: No effect. \n
1959	/// 11: Negate the bit mask only for bits with an index less than or equal
1960	/// to the size of \a A or \a B. \n
1961	/// \returns Returns bit 0 of the resulting bit mask.
1962	#define _mm_cmpistro(A, B, M) \
1963	((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
1964	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
1965
1966	/// Uses the immediate operand \a M to perform a comparison of string
1967	/// data with implicitly defined lengths that is contained in source operands
1968	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1969	/// the maximum, otherwise, returns 0.
1970	///
1971	/// \headerfile <x86intrin.h>
1972	///
1973	/// \code
1974	/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1975	/// \endcode
1976	///
1977	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1978	/// instruction.
1979	///
1980	/// \param A
1981	/// A 128-bit integer vector containing one of the source operands to be
1982	/// compared.
1983	/// \param B
1984	/// A 128-bit integer vector containing one of the source operands to be
1985	/// compared.
1986	/// \param M
1987	/// An 8-bit immediate operand specifying whether the characters are bytes or
1988	/// words and the type of comparison to perform. \n
1989	/// Bits [1:0]: Determine source data format. \n
1990	/// 00: 16 unsigned bytes \n
1991	/// 01: 8 unsigned words \n
1992	/// 10: 16 signed bytes \n
1993	/// 11: 8 signed words \n
1994	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1995	/// 00: Subset: Each character in \a B is compared for equality with all
1996	/// the characters in \a A. \n
1997	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
1998	/// basis is greater than or equal for even-indexed elements in \a A,
1999	/// and less than or equal for odd-indexed elements in \a A. \n
2000	/// 10: Match: Compare each pair of corresponding characters in \a A and
2001	/// \a B for equality. \n
2002	/// 11: Substring: Search \a B for substring matches of \a A. \n
2003	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2004	/// mask of the comparison results. \n
2005	/// 00: No effect. \n
2006	/// 01: Negate the bit mask. \n
2007	/// 10: No effect. \n
2008	/// 11: Negate the bit mask only for bits with an index less than or equal
2009	/// to the size of \a A or \a B. \n
2010	/// \returns Returns 1 if the length of the string in \a A is less than the
2011	/// maximum, otherwise, returns 0.
2012	#define _mm_cmpistrs(A, B, M) \
2013	((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
2014	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
2015
2016	/// Uses the immediate operand \a M to perform a comparison of string
2017	/// data with implicitly defined lengths that is contained in source operands
2018	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
2019	/// the maximum, otherwise, returns 0.
2020	///
2021	/// \headerfile <x86intrin.h>
2022	///
2023	/// \code
2024	/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
2025	/// \endcode
2026	///
2027	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
2028	/// instruction.
2029	///
2030	/// \param A
2031	/// A 128-bit integer vector containing one of the source operands to be
2032	/// compared.
2033	/// \param B
2034	/// A 128-bit integer vector containing one of the source operands to be
2035	/// compared.
2036	/// \param M
2037	/// An 8-bit immediate operand specifying whether the characters are bytes or
2038	/// words and the type of comparison to perform. \n
2039	/// Bits [1:0]: Determine source data format. \n
2040	/// 00: 16 unsigned bytes \n
2041	/// 01: 8 unsigned words \n
2042	/// 10: 16 signed bytes \n
2043	/// 11: 8 signed words \n
2044	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2045	/// 00: Subset: Each character in \a B is compared for equality with all
2046	/// the characters in \a A. \n
2047	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2048	/// basis is greater than or equal for even-indexed elements in \a A,
2049	/// and less than or equal for odd-indexed elements in \a A. \n
2050	/// 10: Match: Compare each pair of corresponding characters in \a A and
2051	/// \a B for equality. \n
2052	/// 11: Substring: Search \a B for substring matches of \a A. \n
2053	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2054	/// mask of the comparison results. \n
2055	/// 00: No effect. \n
2056	/// 01: Negate the bit mask. \n
2057	/// 10: No effect. \n
2058	/// 11: Negate the bit mask only for bits with an index less than or equal
2059	/// to the size of \a A or \a B.
2060	/// \returns Returns 1 if the length of the string in \a B is less than the
2061	/// maximum, otherwise, returns 0.
2062	#define _mm_cmpistrz(A, B, M) \
2063	((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
2064	(__v16qi)(__m128i)(B), (int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
2065
2066	/// Uses the immediate operand \a M to perform a comparison of string
2067	/// data with explicitly defined lengths that is contained in source operands
2068	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2069	/// string in \a B is the maximum, otherwise, returns 0.
2070	///
2071	/// \headerfile <x86intrin.h>
2072	///
2073	/// \code
2074	/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2075	/// \endcode
2076	///
2077	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2078	/// instruction.
2079	///
2080	/// \param A
2081	/// A 128-bit integer vector containing one of the source operands to be
2082	/// compared.
2083	/// \param LA
2084	/// An integer that specifies the length of the string in \a A.
2085	/// \param B
2086	/// A 128-bit integer vector containing one of the source operands to be
2087	/// compared.
2088	/// \param LB
2089	/// An integer that specifies the length of the string in \a B.
2090	/// \param M
2091	/// An 8-bit immediate operand specifying whether the characters are bytes or
2092	/// words and the type of comparison to perform. \n
2093	/// Bits [1:0]: Determine source data format. \n
2094	/// 00: 16 unsigned bytes \n
2095	/// 01: 8 unsigned words \n
2096	/// 10: 16 signed bytes \n
2097	/// 11: 8 signed words \n
2098	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2099	/// 00: Subset: Each character in \a B is compared for equality with all
2100	/// the characters in \a A. \n
2101	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2102	/// basis is greater than or equal for even-indexed elements in \a A,
2103	/// and less than or equal for odd-indexed elements in \a A. \n
2104	/// 10: Match: Compare each pair of corresponding characters in \a A and
2105	/// \a B for equality. \n
2106	/// 11: Substring: Search \a B for substring matches of \a A. \n
2107	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2108	/// mask of the comparison results. \n
2109	/// 00: No effect. \n
2110	/// 01: Negate the bit mask. \n
2111	/// 10: No effect. \n
2112	/// 11: Negate the bit mask only for bits with an index less than or equal
2113	/// to the size of \a A or \a B.
2114	/// \returns Returns 1 if the bit mask is zero and the length of the string in
2115	/// \a B is the maximum, otherwise, returns 0.
2116	#define _mm_cmpestra(A, LA, B, LB, M) \
2117	((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
2118	(__v16qi)(__m128i)(B), (int)(LB), \
2119	(int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
2120
2121	/// Uses the immediate operand \a M to perform a comparison of string
2122	/// data with explicitly defined lengths that is contained in source operands
2123	/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2124	/// returns 0.
2125	///
2126	/// \headerfile <x86intrin.h>
2127	///
2128	/// \code
2129	/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2130	/// \endcode
2131	///
2132	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2133	/// instruction.
2134	///
2135	/// \param A
2136	/// A 128-bit integer vector containing one of the source operands to be
2137	/// compared.
2138	/// \param LA
2139	/// An integer that specifies the length of the string in \a A.
2140	/// \param B
2141	/// A 128-bit integer vector containing one of the source operands to be
2142	/// compared.
2143	/// \param LB
2144	/// An integer that specifies the length of the string in \a B.
2145	/// \param M
2146	/// An 8-bit immediate operand specifying whether the characters are bytes or
2147	/// words and the type of comparison to perform. \n
2148	/// Bits [1:0]: Determine source data format. \n
2149	/// 00: 16 unsigned bytes \n
2150	/// 01: 8 unsigned words \n
2151	/// 10: 16 signed bytes \n
2152	/// 11: 8 signed words \n
2153	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2154	/// 00: Subset: Each character in \a B is compared for equality with all
2155	/// the characters in \a A. \n
2156	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
2157	/// basis is greater than or equal for even-indexed elements in \a A,
2158	/// and less than or equal for odd-indexed elements in \a A. \n
2159	/// 10: Match: Compare each pair of corresponding characters in \a A and
2160	/// \a B for equality. \n
2161	/// 11: Substring: Search \a B for substring matches of \a A. \n
2162	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2163	/// mask of the comparison results. \n
2164	/// 00: No effect. \n
2165	/// 01: Negate the bit mask. \n
2166	/// 10: No effect. \n
2167	/// 11: Negate the bit mask only for bits with an index less than or equal
2168	/// to the size of \a A or \a B. \n
2169	/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
2170	#define _mm_cmpestrc(A, LA, B, LB, M) \
2171	((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
2172	(__v16qi)(__m128i)(B), (int)(LB), \
2173	(int)(M))) /* whole expansion parenthesized so the cast result is a single operand */
2174
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2227
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2282
/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))
2336
2337	/* SSE4.2 Compare Packed Data -- Greater Than. */
2338	/// Compares each of the corresponding 64-bit values of the 128-bit
2339	/// integer vectors to determine if the values in the first operand are
2340	/// greater than those in the second operand.
2341	///
2342	/// \headerfile <x86intrin.h>
2343	///
2344	/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2345	///
2346	/// \param __V1
2347	/// A 128-bit integer vector.
2348	/// \param __V2
2349	/// A 128-bit integer vector.
2350	/// \returns A 128-bit integer vector containing the comparison results.
2351	static __inline__ __m128i __DEFAULT_FN_ATTRS
2352	_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
2353	{
2354	return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2355	}
2356
2357	/* SSE4.2 Accumulate CRC32. */
2358	/// Adds the unsigned integer operand to the CRC-32C checksum of the
2359	/// unsigned char operand.
2360	///
2361	/// \headerfile <x86intrin.h>
2362	///
2363	/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
2364	///
2365	/// \param __C
2366	/// An unsigned integer operand to add to the CRC-32C checksum of operand
2367	/// \a __D.
2368	/// \param __D
2369	/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
2370	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2371	/// operand \a __D.
2372	static __inline__ unsigned int __DEFAULT_FN_ATTRS
2373	_mm_crc32_u8(unsigned int __C, unsigned char __D)
2374	{
2375	return __builtin_ia32_crc32qi(__C, __D);
2376	}
2377
2378	/// Adds the unsigned integer operand to the CRC-32C checksum of the
2379	/// unsigned short operand.
2380	///
2381	/// \headerfile <x86intrin.h>
2382	///
2383	/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
2384	///
2385	/// \param __C
2386	/// An unsigned integer operand to add to the CRC-32C checksum of operand
2387	/// \a __D.
2388	/// \param __D
2389	/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
2390	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2391	/// operand \a __D.
2392	static __inline__ unsigned int __DEFAULT_FN_ATTRS
2393	_mm_crc32_u16(unsigned int __C, unsigned short __D)
2394	{
2395	return __builtin_ia32_crc32hi(__C, __D);
2396	}
2397
2398	/// Adds the first unsigned integer operand to the CRC-32C checksum of
2399	/// the second unsigned integer operand.
2400	///
2401	/// \headerfile <x86intrin.h>
2402	///
2403	/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
2404	///
2405	/// \param __C
2406	/// An unsigned integer operand to add to the CRC-32C checksum of operand
2407	/// \a __D.
2408	/// \param __D
2409	/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
2410	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2411	/// operand \a __D.
2412	static __inline__ unsigned int __DEFAULT_FN_ATTRS
2413	_mm_crc32_u32(unsigned int __C, unsigned int __D)
2414	{
2415	return __builtin_ia32_crc32si(__C, __D);
2416	}
2417
2418	#ifdef __x86_64__
2419	/// Adds the unsigned integer operand to the CRC-32C checksum of the
2420	/// unsigned 64-bit integer operand.
2421	///
2422	/// \headerfile <x86intrin.h>
2423	///
2424	/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
2425	///
2426	/// \param __C
2427	/// An unsigned integer operand to add to the CRC-32C checksum of operand
2428	/// \a __D.
2429	/// \param __D
2430	/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
2431	/// \returns The result of adding operand \a __C to the CRC-32C checksum of
2432	/// operand \a __D.
2433	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
2434	_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
2435	{
2436	return __builtin_ia32_crc32di(__C, __D);
2437	}
2438	#endif /* __x86_64__ */
2439
2440	#undef __DEFAULT_FN_ATTRS
2441
2442	#include <popcntintrin.h>
2443
2444	#endif /* __SMMINTRIN_H */
2445

Clang Project