xmmintrin.h source code [clang_source_code/lib/Headers/xmmintrin.h]

1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2	*
3	* Permission is hereby granted, free of charge, to any person obtaining a copy
4	* of this software and associated documentation files (the "Software"), to deal
5	* in the Software without restriction, including without limitation the rights
6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	* copies of the Software, and to permit persons to whom the Software is
8	* furnished to do so, subject to the following conditions:
9	*
10	* The above copyright notice and this permission notice shall be included in
11	* all copies or substantial portions of the Software.
12	*
13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	* THE SOFTWARE.
20	*
21	*===-----------------------------------------------------------------------===
22	*/
23
24	#ifndef __XMMINTRIN_H
25	#define __XMMINTRIN_H
26
27	#include <mmintrin.h>
28
29	typedef int __v4si __attribute__((__vector_size__(16))); /* 16-byte vector of int elements */
30	typedef float __v4sf __attribute__((__vector_size__(16))); /* 16-byte vector of float; cast target for arithmetic and builtin calls */
31	typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); /* 16-byte float vector, 16-byte aligned; parameter/return type of the intrinsics */
32
33	typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); /* same float vector layout, declared with 1-byte alignment */
34
35	/* Unsigned types: 16-byte vector of unsigned int, used by the bitwise intrinsics. */
36	typedef unsigned int __v4su __attribute__((__vector_size__(16)));
37
38	/* <mm_malloc.h> should only be included in a hosted environment, as it
39	* depends on a standard library to provide allocation routines. */
40	#if __STDC_HOSTED__
41	#include <mm_malloc.h>
42	#endif
43
44	/* Define the default attributes for the functions in this file. Both macros
 * request __always_inline__ and __nodebug__; they differ in the __target__
 * string ("sse" vs. "mmx,sse") and in __min_vector_width__ (128 vs. 64). */
45	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
46	#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
47
48	/// Adds the 32-bit float values in the low-order bits of the operands.
49	///
50	/// \headerfile <x86intrin.h>
51	///
52	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
53	///
54	/// \param __a
55	/// A 128-bit vector of [4 x float] containing one of the source operands.
56	/// The lower 32 bits of this operand are used in the calculation.
57	/// \param __b
58	/// A 128-bit vector of [4 x float] containing one of the source operands.
59	/// The lower 32 bits of this operand are used in the calculation.
60	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
61	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
62	/// the upper 96 bits of the first source operand.
63	static __inline__ __m128 __DEFAULT_FN_ATTRS
64	_mm_add_ss(__m128 __a, __m128 __b)
65	{
66	__a[0] += __b[0];
67	return __a;
68	}
69
70	/// Adds two 128-bit vectors of [4 x float], and returns the results of
71	/// the addition.
72	///
73	/// \headerfile <x86intrin.h>
74	///
75	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
76	///
77	/// \param __a
78	/// A 128-bit vector of [4 x float] containing one of the source operands.
79	/// \param __b
80	/// A 128-bit vector of [4 x float] containing one of the source operands.
81	/// \returns A 128-bit vector of [4 x float] containing the sums of both
82	/// operands.
83	static __inline__ __m128 __DEFAULT_FN_ATTRS
84	_mm_add_ps(__m128 __a, __m128 __b)
85	{
86	return (__m128)((__v4sf)__a + (__v4sf)__b);
87	}
88
89	/// Subtracts the 32-bit float value in the low-order bits of the second
90	/// operand from the corresponding value in the first operand.
91	///
92	/// \headerfile <x86intrin.h>
93	///
94	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
95	///
96	/// \param __a
97	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
98	/// of this operand are used in the calculation.
99	/// \param __b
100	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
101	/// bits of this operand are used in the calculation.
102	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
103	/// difference of the lower 32 bits of both operands. The upper 96 bits are
104	/// copied from the upper 96 bits of the first source operand.
105	static __inline__ __m128 __DEFAULT_FN_ATTRS
106	_mm_sub_ss(__m128 __a, __m128 __b)
107	{
108	__a[0] -= __b[0];
109	return __a;
110	}
111
112	/// Subtracts each of the values of the second operand from the first
113	/// operand, both of which are 128-bit vectors of [4 x float] and returns
114	/// the results of the subtraction.
115	///
116	/// \headerfile <x86intrin.h>
117	///
118	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
119	///
120	/// \param __a
121	/// A 128-bit vector of [4 x float] containing the minuend.
122	/// \param __b
123	/// A 128-bit vector of [4 x float] containing the subtrahend.
124	/// \returns A 128-bit vector of [4 x float] containing the differences between
125	/// both operands.
126	static __inline__ __m128 __DEFAULT_FN_ATTRS
127	_mm_sub_ps(__m128 __a, __m128 __b)
128	{
129	return (__m128)((__v4sf)__a - (__v4sf)__b);
130	}
131
132	/// Multiplies two 32-bit float values in the low-order bits of the
133	/// operands.
134	///
135	/// \headerfile <x86intrin.h>
136	///
137	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
138	///
139	/// \param __a
140	/// A 128-bit vector of [4 x float] containing one of the source operands.
141	/// The lower 32 bits of this operand are used in the calculation.
142	/// \param __b
143	/// A 128-bit vector of [4 x float] containing one of the source operands.
144	/// The lower 32 bits of this operand are used in the calculation.
145	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
146	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
147	/// bits of the first source operand.
148	static __inline__ __m128 __DEFAULT_FN_ATTRS
149	_mm_mul_ss(__m128 __a, __m128 __b)
150	{
151	__a[0] *= __b[0];
152	return __a;
153	}
154
155	/// Multiplies two 128-bit vectors of [4 x float] and returns the
156	/// results of the multiplication.
157	///
158	/// \headerfile <x86intrin.h>
159	///
160	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
161	///
162	/// \param __a
163	/// A 128-bit vector of [4 x float] containing one of the source operands.
164	/// \param __b
165	/// A 128-bit vector of [4 x float] containing one of the source operands.
166	/// \returns A 128-bit vector of [4 x float] containing the products of both
167	/// operands.
168	static __inline__ __m128 __DEFAULT_FN_ATTRS
169	_mm_mul_ps(__m128 __a, __m128 __b)
170	{
171	return (__m128)((__v4sf)__a * (__v4sf)__b);
172	}
173
174	/// Divides the value in the low-order 32 bits of the first operand by
175	/// the corresponding value in the second operand.
176	///
177	/// \headerfile <x86intrin.h>
178	///
179	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
180	///
181	/// \param __a
182	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
183	/// bits of this operand are used in the calculation.
184	/// \param __b
185	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
186	/// of this operand are used in the calculation.
187	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
188	/// lower 32 bits of both operands. The upper 96 bits are copied from the
189	/// upper 96 bits of the first source operand.
190	static __inline__ __m128 __DEFAULT_FN_ATTRS
191	_mm_div_ss(__m128 __a, __m128 __b)
192	{
193	__a[0] /= __b[0];
194	return __a;
195	}
196
197	/// Divides two 128-bit vectors of [4 x float].
198	///
199	/// \headerfile <x86intrin.h>
200	///
201	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
202	///
203	/// \param __a
204	/// A 128-bit vector of [4 x float] containing the dividend.
205	/// \param __b
206	/// A 128-bit vector of [4 x float] containing the divisor.
207	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
208	/// operands.
209	static __inline__ __m128 __DEFAULT_FN_ATTRS
210	_mm_div_ps(__m128 __a, __m128 __b)
211	{
212	return (__m128)((__v4sf)__a / (__v4sf)__b);
213	}
214
215	/// Calculates the square root of the value stored in the low-order bits
216	/// of a 128-bit vector of [4 x float].
217	///
218	/// \headerfile <x86intrin.h>
219	///
220	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
221	///
222	/// \param __a
223	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
224	/// used in the calculation.
225	/// \returns A 128-bit vector of [4 x float] containing the square root of the
226	/// value in the low-order bits of the operand.
227	static __inline__ __m128 __DEFAULT_FN_ATTRS
228	_mm_sqrt_ss(__m128 __a)
229	{
230	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
231	}
232
233	/// Calculates the square roots of the values stored in a 128-bit vector
234	/// of [4 x float].
235	///
236	/// \headerfile <x86intrin.h>
237	///
238	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
239	///
240	/// \param __a
241	/// A 128-bit vector of [4 x float].
242	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
243	/// values in the operand.
244	static __inline__ __m128 __DEFAULT_FN_ATTRS
245	_mm_sqrt_ps(__m128 __a)
246	{
247	return __builtin_ia32_sqrtps((__v4sf)__a);
248	}
249
250	/// Calculates the approximate reciprocal of the value stored in the
251	/// low-order bits of a 128-bit vector of [4 x float].
252	///
253	/// \headerfile <x86intrin.h>
254	///
255	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
256	///
257	/// \param __a
258	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
259	/// used in the calculation.
260	/// \returns A 128-bit vector of [4 x float] containing the approximate
261	/// reciprocal of the value in the low-order bits of the operand.
262	static __inline__ __m128 __DEFAULT_FN_ATTRS
263	_mm_rcp_ss(__m128 __a)
264	{
265	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
266	}
267
268	/// Calculates the approximate reciprocals of the values stored in a
269	/// 128-bit vector of [4 x float].
270	///
271	/// \headerfile <x86intrin.h>
272	///
273	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
274	///
275	/// \param __a
276	/// A 128-bit vector of [4 x float].
277	/// \returns A 128-bit vector of [4 x float] containing the approximate
278	/// reciprocals of the values in the operand.
279	static __inline__ __m128 __DEFAULT_FN_ATTRS
280	_mm_rcp_ps(__m128 __a)
281	{
282	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
283	}
284
285	/// Calculates the approximate reciprocal of the square root of the value
286	/// stored in the low-order bits of a 128-bit vector of [4 x float].
287	///
288	/// \headerfile <x86intrin.h>
289	///
290	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
291	///
292	/// \param __a
293	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
294	/// used in the calculation.
295	/// \returns A 128-bit vector of [4 x float] containing the approximate
296	/// reciprocal of the square root of the value in the low-order bits of the
297	/// operand.
298	static __inline__ __m128 __DEFAULT_FN_ATTRS
299	_mm_rsqrt_ss(__m128 __a)
300	{
301	return __builtin_ia32_rsqrtss((__v4sf)__a);
302	}
303
304	/// Calculates the approximate reciprocals of the square roots of the
305	/// values stored in a 128-bit vector of [4 x float].
306	///
307	/// \headerfile <x86intrin.h>
308	///
309	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
310	///
311	/// \param __a
312	/// A 128-bit vector of [4 x float].
313	/// \returns A 128-bit vector of [4 x float] containing the approximate
314	/// reciprocals of the square roots of the values in the operand.
315	static __inline__ __m128 __DEFAULT_FN_ATTRS
316	_mm_rsqrt_ps(__m128 __a)
317	{
318	return __builtin_ia32_rsqrtps((__v4sf)__a);
319	}
320
321	/// Compares two 32-bit float values in the low-order bits of both
322	/// operands and returns the lesser value in the low-order bits of the
323	/// vector of [4 x float].
324	///
325	/// \headerfile <x86intrin.h>
326	///
327	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
328	///
329	/// \param __a
330	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
331	/// 32 bits of this operand are used in the comparison.
332	/// \param __b
333	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
334	/// 32 bits of this operand are used in the comparison.
335	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
336	/// minimum value between both operands. The upper 96 bits are copied from
337	/// the upper 96 bits of the first source operand.
338	static __inline__ __m128 __DEFAULT_FN_ATTRS
339	_mm_min_ss(__m128 __a, __m128 __b)
340	{
341	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
342	}
343
344	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
345	/// of each pair of values.
346	///
347	/// \headerfile <x86intrin.h>
348	///
349	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
350	///
351	/// \param __a
352	/// A 128-bit vector of [4 x float] containing one of the operands.
353	/// \param __b
354	/// A 128-bit vector of [4 x float] containing one of the operands.
355	/// \returns A 128-bit vector of [4 x float] containing the minimum values
356	/// between both operands.
357	static __inline__ __m128 __DEFAULT_FN_ATTRS
358	_mm_min_ps(__m128 __a, __m128 __b)
359	{
360	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
361	}
362
363	/// Compares two 32-bit float values in the low-order bits of both
364	/// operands and returns the greater value in the low-order bits of a 128-bit
365	/// vector of [4 x float].
366	///
367	/// \headerfile <x86intrin.h>
368	///
369	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
370	///
371	/// \param __a
372	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
373	/// 32 bits of this operand are used in the comparison.
374	/// \param __b
375	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
376	/// 32 bits of this operand are used in the comparison.
377	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378	/// maximum value between both operands. The upper 96 bits are copied from
379	/// the upper 96 bits of the first source operand.
380	static __inline__ __m128 __DEFAULT_FN_ATTRS
381	_mm_max_ss(__m128 __a, __m128 __b)
382	{
383	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
384	}
385
386	/// Compares two 128-bit vectors of [4 x float] and returns the greater
387	/// of each pair of values.
388	///
389	/// \headerfile <x86intrin.h>
390	///
391	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
392	///
393	/// \param __a
394	/// A 128-bit vector of [4 x float] containing one of the operands.
395	/// \param __b
396	/// A 128-bit vector of [4 x float] containing one of the operands.
397	/// \returns A 128-bit vector of [4 x float] containing the maximum values
398	/// between both operands.
399	static __inline__ __m128 __DEFAULT_FN_ATTRS
400	_mm_max_ps(__m128 __a, __m128 __b)
401	{
402	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
403	}
404
405	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
406	///
407	/// \headerfile <x86intrin.h>
408	///
409	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
410	///
411	/// \param __a
412	/// A 128-bit vector containing one of the source operands.
413	/// \param __b
414	/// A 128-bit vector containing one of the source operands.
415	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
416	/// values between both operands.
417	static __inline__ __m128 __DEFAULT_FN_ATTRS
418	_mm_and_ps(__m128 __a, __m128 __b)
419	{
420	return (__m128)((__v4su)__a & (__v4su)__b);
421	}
422
423	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
424	/// the one's complement of the values contained in the first source
425	/// operand.
426	///
427	/// \headerfile <x86intrin.h>
428	///
429	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
430	///
431	/// \param __a
432	/// A 128-bit vector of [4 x float] containing the first source operand. The
433	/// one's complement of this value is used in the bitwise AND.
434	/// \param __b
435	/// A 128-bit vector of [4 x float] containing the second source operand.
436	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
437	/// one's complement of the first operand and the values in the second
438	/// operand.
439	static __inline__ __m128 __DEFAULT_FN_ATTRS
440	_mm_andnot_ps(__m128 __a, __m128 __b)
441	{
442	return (__m128)(~(__v4su)__a & (__v4su)__b);
443	}
444
445	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
446	///
447	/// \headerfile <x86intrin.h>
448	///
449	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
450	///
451	/// \param __a
452	/// A 128-bit vector of [4 x float] containing one of the source operands.
453	/// \param __b
454	/// A 128-bit vector of [4 x float] containing one of the source operands.
455	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
456	/// values between both operands.
457	static __inline__ __m128 __DEFAULT_FN_ATTRS
458	_mm_or_ps(__m128 __a, __m128 __b)
459	{
460	return (__m128)((__v4su)__a \| (__v4su)__b);
461	}
462
463	/// Performs a bitwise exclusive OR of two 128-bit vectors of
464	/// [4 x float].
465	///
466	/// \headerfile <x86intrin.h>
467	///
468	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
469	///
470	/// \param __a
471	/// A 128-bit vector of [4 x float] containing one of the source operands.
472	/// \param __b
473	/// A 128-bit vector of [4 x float] containing one of the source operands.
474	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
475	/// of the values between both operands.
476	static __inline__ __m128 __DEFAULT_FN_ATTRS
477	_mm_xor_ps(__m128 __a, __m128 __b)
478	{
479	return (__m128)((__v4su)__a ^ (__v4su)__b);
480	}
481
482	/// Compares two 32-bit float values in the low-order bits of both
483	/// operands for equality and returns the result of the comparison in the
484	/// low-order bits of a vector [4 x float].
485	///
486	/// \headerfile <x86intrin.h>
487	///
488	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
489	///
490	/// \param __a
491	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
492	/// 32 bits of this operand are used in the comparison.
493	/// \param __b
494	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
495	/// 32 bits of this operand are used in the comparison.
496	/// \returns A 128-bit vector of [4 x float] containing the comparison results
497	/// in the low-order bits.
498	static __inline__ __m128 __DEFAULT_FN_ATTRS
499	_mm_cmpeq_ss(__m128 __a, __m128 __b)
500	{
501	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
502	}
503
504	/// Compares each of the corresponding 32-bit float values of the
505	/// 128-bit vectors of [4 x float] for equality.
506	///
507	/// \headerfile <x86intrin.h>
508	///
509	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
510	///
511	/// \param __a
512	/// A 128-bit vector of [4 x float].
513	/// \param __b
514	/// A 128-bit vector of [4 x float].
515	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
516	static __inline__ __m128 __DEFAULT_FN_ATTRS
517	_mm_cmpeq_ps(__m128 __a, __m128 __b)
518	{
519	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
520	}
521
522	/// Compares two 32-bit float values in the low-order bits of both
523	/// operands to determine if the value in the first operand is less than the
524	/// corresponding value in the second operand and returns the result of the
525	/// comparison in the low-order bits of a vector of [4 x float].
526	///
527	/// \headerfile <x86intrin.h>
528	///
529	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
530	///
531	/// \param __a
532	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
533	/// 32 bits of this operand are used in the comparison.
534	/// \param __b
535	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
536	/// 32 bits of this operand are used in the comparison.
537	/// \returns A 128-bit vector of [4 x float] containing the comparison results
538	/// in the low-order bits.
539	static __inline__ __m128 __DEFAULT_FN_ATTRS
540	_mm_cmplt_ss(__m128 __a, __m128 __b)
541	{
542	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
543	}
544
545	/// Compares each of the corresponding 32-bit float values of the
546	/// 128-bit vectors of [4 x float] to determine if the values in the first
547	/// operand are less than those in the second operand.
548	///
549	/// \headerfile <x86intrin.h>
550	///
551	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
552	///
553	/// \param __a
554	/// A 128-bit vector of [4 x float].
555	/// \param __b
556	/// A 128-bit vector of [4 x float].
557	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
558	static __inline__ __m128 __DEFAULT_FN_ATTRS
559	_mm_cmplt_ps(__m128 __a, __m128 __b)
560	{
561	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
562	}
563
564	/// Compares two 32-bit float values in the low-order bits of both
565	/// operands to determine if the value in the first operand is less than or
566	/// equal to the corresponding value in the second operand and returns the
567	/// result of the comparison in the low-order bits of a vector of
568	/// [4 x float].
569	///
570	/// \headerfile <x86intrin.h>
571	///
572	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
573	///
574	/// \param __a
575	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
576	/// 32 bits of this operand are used in the comparison.
577	/// \param __b
578	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
579	/// 32 bits of this operand are used in the comparison.
580	/// \returns A 128-bit vector of [4 x float] containing the comparison results
581	/// in the low-order bits.
582	static __inline__ __m128 __DEFAULT_FN_ATTRS
583	_mm_cmple_ss(__m128 __a, __m128 __b)
584	{
585	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
586	}
587
588	/// Compares each of the corresponding 32-bit float values of the
589	/// 128-bit vectors of [4 x float] to determine if the values in the first
590	/// operand are less than or equal to those in the second operand.
591	///
592	/// \headerfile <x86intrin.h>
593	///
594	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
595	///
596	/// \param __a
597	/// A 128-bit vector of [4 x float].
598	/// \param __b
599	/// A 128-bit vector of [4 x float].
600	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
601	static __inline__ __m128 __DEFAULT_FN_ATTRS
602	_mm_cmple_ps(__m128 __a, __m128 __b)
603	{
604	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
605	}
606
607	/// Compares two 32-bit float values in the low-order bits of both
608	/// operands to determine if the value in the first operand is greater than
609	/// the corresponding value in the second operand and returns the result of
610	/// the comparison in the low-order bits of a vector of [4 x float].
611	///
612	/// \headerfile <x86intrin.h>
613	///
614	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
615	///
616	/// \param __a
617	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
618	/// 32 bits of this operand are used in the comparison.
619	/// \param __b
620	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
621	/// 32 bits of this operand are used in the comparison.
622	/// \returns A 128-bit vector of [4 x float] containing the comparison results
623	/// in the low-order bits.
624	static __inline__ __m128 __DEFAULT_FN_ATTRS
625	_mm_cmpgt_ss(__m128 __a, __m128 __b)
626	{
627	return (__m128)__builtin_shufflevector((__v4sf)__a,
628	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
629	4, 1, 2, 3);
630	}
631
632	/// Compares each of the corresponding 32-bit float values of the
633	/// 128-bit vectors of [4 x float] to determine if the values in the first
634	/// operand are greater than those in the second operand.
635	///
636	/// \headerfile <x86intrin.h>
637	///
638	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
639	///
640	/// \param __a
641	/// A 128-bit vector of [4 x float].
642	/// \param __b
643	/// A 128-bit vector of [4 x float].
644	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
645	static __inline__ __m128 __DEFAULT_FN_ATTRS
646	_mm_cmpgt_ps(__m128 __a, __m128 __b)
647	{
648	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
649	}
650
651	/// Compares two 32-bit float values in the low-order bits of both
652	/// operands to determine if the value in the first operand is greater than
653	/// or equal to the corresponding value in the second operand and returns
654	/// the result of the comparison in the low-order bits of a vector of
655	/// [4 x float].
656	///
657	/// \headerfile <x86intrin.h>
658	///
659	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
660	///
661	/// \param __a
662	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
663	/// 32 bits of this operand are used in the comparison.
664	/// \param __b
665	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
666	/// 32 bits of this operand are used in the comparison.
667	/// \returns A 128-bit vector of [4 x float] containing the comparison results
668	/// in the low-order bits.
669	static __inline__ __m128 __DEFAULT_FN_ATTRS
670	_mm_cmpge_ss(__m128 __a, __m128 __b)
671	{
672	return (__m128)__builtin_shufflevector((__v4sf)__a,
673	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
674	4, 1, 2, 3);
675	}
676
677	/// Compares each of the corresponding 32-bit float values of the
678	/// 128-bit vectors of [4 x float] to determine if the values in the first
679	/// operand are greater than or equal to those in the second operand.
680	///
681	/// \headerfile <x86intrin.h>
682	///
683	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
684	///
685	/// \param __a
686	/// A 128-bit vector of [4 x float].
687	/// \param __b
688	/// A 128-bit vector of [4 x float].
689	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
690	static __inline__ __m128 __DEFAULT_FN_ATTRS
691	_mm_cmpge_ps(__m128 __a, __m128 __b)
692	{
693	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
694	}
695
696	/// Compares two 32-bit float values in the low-order bits of both
697	/// operands for inequality and returns the result of the comparison in the
698	/// low-order bits of a vector of [4 x float].
699	///
700	/// \headerfile <x86intrin.h>
701	///
702	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
703	/// instructions.
704	///
705	/// \param __a
706	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
707	/// 32 bits of this operand are used in the comparison.
708	/// \param __b
709	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
710	/// 32 bits of this operand are used in the comparison.
711	/// \returns A 128-bit vector of [4 x float] containing the comparison results
712	/// in the low-order bits.
713	static __inline__ __m128 __DEFAULT_FN_ATTRS
714	_mm_cmpneq_ss(__m128 __a, __m128 __b)
715	{
716	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
717	}
718
719	/// Compares each of the corresponding 32-bit float values of the
720	/// 128-bit vectors of [4 x float] for inequality.
721	///
722	/// \headerfile <x86intrin.h>
723	///
724	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
725	/// instructions.
726	///
727	/// \param __a
728	/// A 128-bit vector of [4 x float].
729	/// \param __b
730	/// A 128-bit vector of [4 x float].
731	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
732	static __inline__ __m128 __DEFAULT_FN_ATTRS
733	_mm_cmpneq_ps(__m128 __a, __m128 __b)
734	{
735	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
736	}
737
738	/// Compares two 32-bit float values in the low-order bits of both
739	/// operands to determine if the value in the first operand is not less than
740	/// the corresponding value in the second operand and returns the result of
741	/// the comparison in the low-order bits of a vector of [4 x float].
742	///
743	/// \headerfile <x86intrin.h>
744	///
745	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
746	/// instructions.
747	///
748	/// \param __a
749	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
750	/// 32 bits of this operand are used in the comparison.
751	/// \param __b
752	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
753	/// 32 bits of this operand are used in the comparison.
754	/// \returns A 128-bit vector of [4 x float] containing the comparison results
755	/// in the low-order bits.
756	static __inline__ __m128 __DEFAULT_FN_ATTRS
757	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
758	{
759	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
760	}
761
762	/// Compares each of the corresponding 32-bit float values of the
763	/// 128-bit vectors of [4 x float] to determine if the values in the first
764	/// operand are not less than those in the second operand.
765	///
766	/// \headerfile <x86intrin.h>
767	///
768	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
769	/// instructions.
770	///
771	/// \param __a
772	/// A 128-bit vector of [4 x float].
773	/// \param __b
774	/// A 128-bit vector of [4 x float].
775	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
776	static __inline__ __m128 __DEFAULT_FN_ATTRS
777	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
778	{
779	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
780	}
781
782	/// Compares two 32-bit float values in the low-order bits of both
783	/// operands to determine if the value in the first operand is not less than
784	/// or equal to the corresponding value in the second operand and returns
785	/// the result of the comparison in the low-order bits of a vector of
786	/// [4 x float].
787	///
788	/// \headerfile <x86intrin.h>
789	///
790	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
791	/// instructions.
792	///
793	/// \param __a
794	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
795	/// 32 bits of this operand are used in the comparison.
796	/// \param __b
797	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
798	/// 32 bits of this operand are used in the comparison.
799	/// \returns A 128-bit vector of [4 x float] containing the comparison results
800	/// in the low-order bits.
801	static __inline__ __m128 __DEFAULT_FN_ATTRS
802	_mm_cmpnle_ss(__m128 __a, __m128 __b)
803	{
804	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
805	}
806
807	/// Compares each of the corresponding 32-bit float values of the
808	/// 128-bit vectors of [4 x float] to determine if the values in the first
809	/// operand are not less than or equal to those in the second operand.
810	///
811	/// \headerfile <x86intrin.h>
812	///
813	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
814	/// instructions.
815	///
816	/// \param __a
817	/// A 128-bit vector of [4 x float].
818	/// \param __b
819	/// A 128-bit vector of [4 x float].
820	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
821	static __inline__ __m128 __DEFAULT_FN_ATTRS
822	_mm_cmpnle_ps(__m128 __a, __m128 __b)
823	{
824	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
825	}
826
827	/// Compares two 32-bit float values in the low-order bits of both
828	/// operands to determine if the value in the first operand is not greater
829	/// than the corresponding value in the second operand and returns the
830	/// result of the comparison in the low-order bits of a vector of
831	/// [4 x float].
832	///
833	/// \headerfile <x86intrin.h>
834	///
835	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
836	/// instructions.
837	///
838	/// \param __a
839	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
840	/// 32 bits of this operand are used in the comparison.
841	/// \param __b
842	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
843	/// 32 bits of this operand are used in the comparison.
844	/// \returns A 128-bit vector of [4 x float] containing the comparison results
845	/// in the low-order bits.
846	static __inline__ __m128 __DEFAULT_FN_ATTRS
847	_mm_cmpngt_ss(__m128 __a, __m128 __b)
848	{
849	return (__m128)__builtin_shufflevector((__v4sf)__a,
850	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
851	4, 1, 2, 3);
852	}
853
854	/// Compares each of the corresponding 32-bit float values of the
855	/// 128-bit vectors of [4 x float] to determine if the values in the first
856	/// operand are not greater than those in the second operand.
857	///
858	/// \headerfile <x86intrin.h>
859	///
860	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
861	/// instructions.
862	///
863	/// \param __a
864	/// A 128-bit vector of [4 x float].
865	/// \param __b
866	/// A 128-bit vector of [4 x float].
867	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
868	static __inline__ __m128 __DEFAULT_FN_ATTRS
869	_mm_cmpngt_ps(__m128 __a, __m128 __b)
870	{
871	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
872	}
873
874	/// Compares two 32-bit float values in the low-order bits of both
875	/// operands to determine if the value in the first operand is not greater
876	/// than or equal to the corresponding value in the second operand and
877	/// returns the result of the comparison in the low-order bits of a vector
878	/// of [4 x float].
879	///
880	/// \headerfile <x86intrin.h>
881	///
882	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
883	/// instructions.
884	///
885	/// \param __a
886	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
887	/// 32 bits of this operand are used in the comparison.
888	/// \param __b
889	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
890	/// 32 bits of this operand are used in the comparison.
891	/// \returns A 128-bit vector of [4 x float] containing the comparison results
892	/// in the low-order bits.
893	static __inline__ __m128 __DEFAULT_FN_ATTRS
894	_mm_cmpnge_ss(__m128 __a, __m128 __b)
895	{
896	return (__m128)__builtin_shufflevector((__v4sf)__a,
897	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
898	4, 1, 2, 3);
899	}
900
901	/// Compares each of the corresponding 32-bit float values of the
902	/// 128-bit vectors of [4 x float] to determine if the values in the first
903	/// operand are not greater than or equal to those in the second operand.
904	///
905	/// \headerfile <x86intrin.h>
906	///
907	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
908	/// instructions.
909	///
910	/// \param __a
911	/// A 128-bit vector of [4 x float].
912	/// \param __b
913	/// A 128-bit vector of [4 x float].
914	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
915	static __inline__ __m128 __DEFAULT_FN_ATTRS
916	_mm_cmpnge_ps(__m128 __a, __m128 __b)
917	{
918	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
919	}
920
921	/// Compares two 32-bit float values in the low-order bits of both
922	/// operands to determine if the value in the first operand is ordered with
923	/// respect to the corresponding value in the second operand and returns the
924	/// result of the comparison in the low-order bits of a vector of
925	/// [4 x float].
926	///
927	/// \headerfile <x86intrin.h>
928	///
929	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
930	/// instructions.
931	///
932	/// \param __a
933	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
934	/// 32 bits of this operand are used in the comparison.
935	/// \param __b
936	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
937	/// 32 bits of this operand are used in the comparison.
938	/// \returns A 128-bit vector of [4 x float] containing the comparison results
939	/// in the low-order bits.
940	static __inline__ __m128 __DEFAULT_FN_ATTRS
941	_mm_cmpord_ss(__m128 __a, __m128 __b)
942	{
943	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
944	}
945
946	/// Compares each of the corresponding 32-bit float values of the
947	/// 128-bit vectors of [4 x float] to determine if the values in the first
948	/// operand are ordered with respect to those in the second operand.
949	///
950	/// \headerfile <x86intrin.h>
951	///
952	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
953	/// instructions.
954	///
955	/// \param __a
956	/// A 128-bit vector of [4 x float].
957	/// \param __b
958	/// A 128-bit vector of [4 x float].
959	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
960	static __inline__ __m128 __DEFAULT_FN_ATTRS
961	_mm_cmpord_ps(__m128 __a, __m128 __b)
962	{
963	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
964	}
965
966	/// Compares two 32-bit float values in the low-order bits of both
967	/// operands to determine if the value in the first operand is unordered
968	/// with respect to the corresponding value in the second operand and
969	/// returns the result of the comparison in the low-order bits of a vector
970	/// of [4 x float].
971	///
972	/// \headerfile <x86intrin.h>
973	///
974	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
975	/// instructions.
976	///
977	/// \param __a
978	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
979	/// 32 bits of this operand are used in the comparison.
980	/// \param __b
981	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
982	/// 32 bits of this operand are used in the comparison.
983	/// \returns A 128-bit vector of [4 x float] containing the comparison results
984	/// in the low-order bits.
985	static __inline__ __m128 __DEFAULT_FN_ATTRS
986	_mm_cmpunord_ss(__m128 __a, __m128 __b)
987	{
988	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
989	}
990
991	/// Compares each of the corresponding 32-bit float values of the
992	/// 128-bit vectors of [4 x float] to determine if the values in the first
993	/// operand are unordered with respect to those in the second operand.
994	///
995	/// \headerfile <x86intrin.h>
996	///
997	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
998	/// instructions.
999	///
1000	/// \param __a
1001	/// A 128-bit vector of [4 x float].
1002	/// \param __b
1003	/// A 128-bit vector of [4 x float].
1004	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1005	static __inline__ __m128 __DEFAULT_FN_ATTRS
1006	_mm_cmpunord_ps(__m128 __a, __m128 __b)
1007	{
1008	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1009	}
1010
1011	/// Compares two 32-bit float values in the low-order bits of both
1012	/// operands for equality and returns the result of the comparison.
1013	///
1014	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1015	///
1016	/// \headerfile <x86intrin.h>
1017	///
1018	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1019	/// instructions.
1020	///
1021	/// \param __a
1022	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1023	/// used in the comparison.
1024	/// \param __b
1025	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1026	/// used in the comparison.
1027	/// \returns An integer containing the comparison results. If either of the
1028	/// two lower 32-bit values is NaN, 0 is returned.
1029	static __inline__ int __DEFAULT_FN_ATTRS
1030	_mm_comieq_ss(__m128 __a, __m128 __b)
1031	{
1032	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1033	}
1034
1035	/// Compares two 32-bit float values in the low-order bits of both
1036	/// operands to determine if the first operand is less than the second
1037	/// operand and returns the result of the comparison.
1038	///
1039	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1040	///
1041	/// \headerfile <x86intrin.h>
1042	///
1043	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1044	/// instructions.
1045	///
1046	/// \param __a
1047	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1048	/// used in the comparison.
1049	/// \param __b
1050	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1051	/// used in the comparison.
1052	/// \returns An integer containing the comparison results. If either of the two
1053	/// lower 32-bit values is NaN, 0 is returned.
1054	static __inline__ int __DEFAULT_FN_ATTRS
1055	_mm_comilt_ss(__m128 __a, __m128 __b)
1056	{
1057	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1058	}
1059
1060	/// Compares two 32-bit float values in the low-order bits of both
1061	/// operands to determine if the first operand is less than or equal to the
1062	/// second operand and returns the result of the comparison.
1063	///
1064	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1065	///
1066	/// \headerfile <x86intrin.h>
1067	///
1068	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1069	///
1070	/// \param __a
1071	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1072	/// used in the comparison.
1073	/// \param __b
1074	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1075	/// used in the comparison.
1076	/// \returns An integer containing the comparison results. If either of the two
1077	/// lower 32-bit values is NaN, 0 is returned.
1078	static __inline__ int __DEFAULT_FN_ATTRS
1079	_mm_comile_ss(__m128 __a, __m128 __b)
1080	{
1081	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1082	}
1083
1084	/// Compares two 32-bit float values in the low-order bits of both
1085	/// operands to determine if the first operand is greater than the second
1086	/// operand and returns the result of the comparison.
1087	///
1088	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1089	///
1090	/// \headerfile <x86intrin.h>
1091	///
1092	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1093	///
1094	/// \param __a
1095	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1096	/// used in the comparison.
1097	/// \param __b
1098	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1099	/// used in the comparison.
1100	/// \returns An integer containing the comparison results. If either of the
1101	/// two lower 32-bit values is NaN, 0 is returned.
1102	static __inline__ int __DEFAULT_FN_ATTRS
1103	_mm_comigt_ss(__m128 __a, __m128 __b)
1104	{
1105	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1106	}
1107
1108	/// Compares two 32-bit float values in the low-order bits of both
1109	/// operands to determine if the first operand is greater than or equal to
1110	/// the second operand and returns the result of the comparison.
1111	///
1112	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1113	///
1114	/// \headerfile <x86intrin.h>
1115	///
1116	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1117	///
1118	/// \param __a
1119	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1120	/// used in the comparison.
1121	/// \param __b
1122	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1123	/// used in the comparison.
1124	/// \returns An integer containing the comparison results. If either of the two
1125	/// lower 32-bit values is NaN, 0 is returned.
1126	static __inline__ int __DEFAULT_FN_ATTRS
1127	_mm_comige_ss(__m128 __a, __m128 __b)
1128	{
1129	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1130	}
1131
1132	/// Compares two 32-bit float values in the low-order bits of both
1133	/// operands to determine if the first operand is not equal to the second
1134	/// operand and returns the result of the comparison.
1135	///
1136	/// If either of the two lower 32-bit values is NaN, 1 is returned.
1137	///
1138	/// \headerfile <x86intrin.h>
1139	///
1140	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1141	///
1142	/// \param __a
1143	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1144	/// used in the comparison.
1145	/// \param __b
1146	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1147	/// used in the comparison.
1148	/// \returns An integer containing the comparison results. If either of the
1149	/// two lower 32-bit values is NaN, 1 is returned.
1150	static __inline__ int __DEFAULT_FN_ATTRS
1151	_mm_comineq_ss(__m128 __a, __m128 __b)
1152	{
1153	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1154	}
1155
1156	/// Performs an unordered comparison of two 32-bit float values using
1157	/// the low-order bits of both operands to determine equality and returns
1158	/// the result of the comparison.
1159	///
1160	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1161	///
1162	/// \headerfile <x86intrin.h>
1163	///
1164	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1165	///
1166	/// \param __a
1167	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1168	/// used in the comparison.
1169	/// \param __b
1170	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1171	/// used in the comparison.
1172	/// \returns An integer containing the comparison results. If either of the two
1173	/// lower 32-bit values is NaN, 0 is returned.
1174	static __inline__ int __DEFAULT_FN_ATTRS
1175	_mm_ucomieq_ss(__m128 __a, __m128 __b)
1176	{
1177	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1178	}
1179
1180	/// Performs an unordered comparison of two 32-bit float values using
1181	/// the low-order bits of both operands to determine if the first operand is
1182	/// less than the second operand and returns the result of the comparison.
1183	///
1184	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1185	///
1186	/// \headerfile <x86intrin.h>
1187	///
1188	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1189	///
1190	/// \param __a
1191	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1192	/// used in the comparison.
1193	/// \param __b
1194	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1195	/// used in the comparison.
1196	/// \returns An integer containing the comparison results. If either of the two
1197	/// lower 32-bit values is NaN, 0 is returned.
1198	static __inline__ int __DEFAULT_FN_ATTRS
1199	_mm_ucomilt_ss(__m128 __a, __m128 __b)
1200	{
1201	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1202	}
1203
1204	/// Performs an unordered comparison of two 32-bit float values using
1205	/// the low-order bits of both operands to determine if the first operand is
1206	/// less than or equal to the second operand and returns the result of the
1207	/// comparison.
1208	///
1209	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1210	///
1211	/// \headerfile <x86intrin.h>
1212	///
1213	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1214	///
1215	/// \param __a
1216	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1217	/// used in the comparison.
1218	/// \param __b
1219	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1220	/// used in the comparison.
1221	/// \returns An integer containing the comparison results. If either of the two
1222	/// lower 32-bit values is NaN, 0 is returned.
1223	static __inline__ int __DEFAULT_FN_ATTRS
1224	_mm_ucomile_ss(__m128 __a, __m128 __b)
1225	{
1226	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1227	}
1228
1229	/// Performs an unordered comparison of two 32-bit float values using
1230	/// the low-order bits of both operands to determine if the first operand is
1231	/// greater than the second operand and returns the result of the
1232	/// comparison.
1233	///
1234	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1235	///
1236	/// \headerfile <x86intrin.h>
1237	///
1238	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1239	///
1240	/// \param __a
1241	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1242	/// used in the comparison.
1243	/// \param __b
1244	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1245	/// used in the comparison.
1246	/// \returns An integer containing the comparison results. If either of the two
1247	/// lower 32-bit values is NaN, 0 is returned.
1248	static __inline__ int __DEFAULT_FN_ATTRS
1249	_mm_ucomigt_ss(__m128 __a, __m128 __b)
1250	{
1251	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1252	}
1253
1254	/// Performs an unordered comparison of two 32-bit float values using
1255	/// the low-order bits of both operands to determine if the first operand is
1256	/// greater than or equal to the second operand and returns the result of
1257	/// the comparison.
1258	///
1259	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1260	///
1261	/// \headerfile <x86intrin.h>
1262	///
1263	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1264	///
1265	/// \param __a
1266	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1267	/// used in the comparison.
1268	/// \param __b
1269	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1270	/// used in the comparison.
1271	/// \returns An integer containing the comparison results. If either of the two
1272	/// lower 32-bit values is NaN, 0 is returned.
1273	static __inline__ int __DEFAULT_FN_ATTRS
1274	_mm_ucomige_ss(__m128 __a, __m128 __b)
1275	{
1276	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1277	}
1278
1279	/// Performs an unordered comparison of two 32-bit float values using
1280	/// the low-order bits of both operands to determine inequality and returns
1281	/// the result of the comparison.
1282	///
1283	/// If either of the two lower 32-bit values is NaN, 1 is returned.
1284	///
1285	/// \headerfile <x86intrin.h>
1286	///
1287	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1288	///
1289	/// \param __a
1290	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1291	/// used in the comparison.
1292	/// \param __b
1293	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1294	/// used in the comparison.
1295	/// \returns An integer containing the comparison results. If either of the two
1296	/// lower 32-bit values is NaN, 1 is returned.
1297	static __inline__ int __DEFAULT_FN_ATTRS
1298	_mm_ucomineq_ss(__m128 __a, __m128 __b)
1299	{
1300	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1301	}
1302
1303	/// Converts a float value contained in the lower 32 bits of a vector of
1304	/// [4 x float] into a 32-bit integer.
1305	///
1306	/// \headerfile <x86intrin.h>
1307	///
1308	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1309	/// instructions.
1310	///
1311	/// \param __a
1312	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1313	/// used in the conversion.
1314	/// \returns A 32-bit integer containing the converted value.
1315	static __inline__ int __DEFAULT_FN_ATTRS
1316	_mm_cvtss_si32(__m128 __a)
1317	{
1318	return __builtin_ia32_cvtss2si((__v4sf)__a);
1319	}
1320
1321	/// Converts a float value contained in the lower 32 bits of a vector of
1322	/// [4 x float] into a 32-bit integer.
1323	///
1324	/// \headerfile <x86intrin.h>
1325	///
1326	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1327	/// instructions.
1328	///
1329	/// \param __a
1330	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1331	/// used in the conversion.
1332	/// \returns A 32-bit integer containing the converted value.
1333	static __inline__ int __DEFAULT_FN_ATTRS
1334	_mm_cvt_ss2si(__m128 __a)
1335	{
1336	return _mm_cvtss_si32(__a);
1337	}
1338
1339	#ifdef __x86_64__
1340
1341	/// Converts a float value contained in the lower 32 bits of a vector of
1342	/// [4 x float] into a 64-bit integer.
1343	///
1344	/// \headerfile <x86intrin.h>
1345	///
1346	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1347	/// instructions.
1348	///
1349	/// \param __a
1350	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1351	/// used in the conversion.
1352	/// \returns A 64-bit integer containing the converted value.
1353	static __inline__ long long __DEFAULT_FN_ATTRS
1354	_mm_cvtss_si64(__m128 __a)
1355	{
1356	return __builtin_ia32_cvtss2si64((__v4sf)__a);
1357	}
1358
1359	#endif
1360
1361	/// Converts two low-order float values in a 128-bit vector of
1362	/// [4 x float] into a 64-bit vector of [2 x i32].
1363	///
1364	/// \headerfile <x86intrin.h>
1365	///
1366	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1367	///
1368	/// \param __a
1369	/// A 128-bit vector of [4 x float].
1370	/// \returns A 64-bit integer vector containing the converted values.
1371	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1372	_mm_cvtps_pi32(__m128 __a)
1373	{
1374	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1375	}
1376
1377	/// Converts two low-order float values in a 128-bit vector of
1378	/// [4 x float] into a 64-bit vector of [2 x i32].
1379	///
1380	/// \headerfile <x86intrin.h>
1381	///
1382	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1383	///
1384	/// \param __a
1385	/// A 128-bit vector of [4 x float].
1386	/// \returns A 64-bit integer vector containing the converted values.
1387	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1388	_mm_cvt_ps2pi(__m128 __a)
1389	{
1390	return _mm_cvtps_pi32(__a);
1391	}
1392
1393	/// Converts a float value contained in the lower 32 bits of a vector of
1394	/// [4 x float] into a 32-bit integer, truncating the result when it is
1395	/// inexact.
1396	///
1397	/// \headerfile <x86intrin.h>
1398	///
1399	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1400	/// instructions.
1401	///
1402	/// \param __a
1403	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1404	/// used in the conversion.
1405	/// \returns A 32-bit integer containing the converted value.
1406	static __inline__ int __DEFAULT_FN_ATTRS
1407	_mm_cvttss_si32(__m128 __a)
1408	{
1409	return __builtin_ia32_cvttss2si((__v4sf)__a);
1410	}
1411
1412	/// Converts a float value contained in the lower 32 bits of a vector of
1413	/// [4 x float] into a 32-bit integer, truncating the result when it is
1414	/// inexact.
1415	///
1416	/// \headerfile <x86intrin.h>
1417	///
1418	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1419	/// instructions.
1420	///
1421	/// \param __a
1422	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1423	/// used in the conversion.
1424	/// \returns A 32-bit integer containing the converted value.
1425	static __inline__ int __DEFAULT_FN_ATTRS
1426	_mm_cvtt_ss2si(__m128 __a)
1427	{
1428	return _mm_cvttss_si32(__a);
1429	}
1430
1431	#ifdef __x86_64__
1432	/// Converts a float value contained in the lower 32 bits of a vector of
1433	/// [4 x float] into a 64-bit integer, truncating the result when it is
1434	/// inexact.
1435	///
1436	/// \headerfile <x86intrin.h>
1437	///
1438	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1439	/// instructions.
1440	///
1441	/// \param __a
1442	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1443	/// used in the conversion.
1444	/// \returns A 64-bit integer containing the converted value.
1445	static __inline__ long long __DEFAULT_FN_ATTRS
1446	_mm_cvttss_si64(__m128 __a)
1447	{
1448	return __builtin_ia32_cvttss2si64((__v4sf)__a);
1449	}
1450	#endif
1451
1452	/// Converts two low-order float values in a 128-bit vector of
1453	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1454	/// when it is inexact.
1455	///
1456	/// \headerfile <x86intrin.h>
1457	///
1458	/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1459	/// instructions.
1460	///
1461	/// \param __a
1462	/// A 128-bit vector of [4 x float].
1463	/// \returns A 64-bit integer vector containing the converted values.
1464	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1465	_mm_cvttps_pi32(__m128 __a)
1466	{
1467	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1468	}
1469
1470	/// Converts two low-order float values in a 128-bit vector of [4 x
1471	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1472	/// is inexact.
1473	///
1474	/// \headerfile <x86intrin.h>
1475	///
1476	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1477	///
1478	/// \param __a
1479	/// A 128-bit vector of [4 x float].
1480	/// \returns A 64-bit integer vector containing the converted values.
1481	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1482	_mm_cvtt_ps2pi(__m128 __a)
1483	{
1484	return _mm_cvttps_pi32(__a);
1485	}
1486
1487	/// Converts a 32-bit signed integer value into a floating point value
1488	/// and writes it to the lower 32 bits of the destination. The remaining
1489	/// higher order elements of the destination vector are copied from the
1490	/// corresponding elements in the first operand.
1491	///
1492	/// \headerfile <x86intrin.h>
1493	///
1494	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1495	///
1496	/// \param __a
1497	/// A 128-bit vector of [4 x float].
1498	/// \param __b
1499	/// A 32-bit signed integer operand containing the value to be converted.
1500	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1501	/// converted value of the second operand. The upper 96 bits are copied from
1502	/// the upper 96 bits of the first operand.
1503	static __inline__ __m128 __DEFAULT_FN_ATTRS
1504	_mm_cvtsi32_ss(__m128 __a, int __b)
1505	{
1506	__a[0] = __b;
1507	return __a;
1508	}
1509
1510	/// Converts a 32-bit signed integer value into a floating point value
1511	/// and writes it to the lower 32 bits of the destination. The remaining
1512	/// higher order elements of the destination are copied from the
1513	/// corresponding elements in the first operand.
1514	///
1515	/// \headerfile <x86intrin.h>
1516	///
1517	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1518	///
1519	/// \param __a
1520	/// A 128-bit vector of [4 x float].
1521	/// \param __b
1522	/// A 32-bit signed integer operand containing the value to be converted.
1523	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1524	/// converted value of the second operand. The upper 96 bits are copied from
1525	/// the upper 96 bits of the first operand.
1526	static __inline__ __m128 __DEFAULT_FN_ATTRS
1527	_mm_cvt_si2ss(__m128 __a, int __b)
1528	{
1529	return _mm_cvtsi32_ss(__a, __b);
1530	}
1531
1532	#ifdef __x86_64__
1533
1534	/// Converts a 64-bit signed integer value into a floating point value
1535	/// and writes it to the lower 32 bits of the destination. The remaining
1536	/// higher order elements of the destination are copied from the
1537	/// corresponding elements in the first operand.
1538	///
1539	/// \headerfile <x86intrin.h>
1540	///
1541	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1542	///
1543	/// \param __a
1544	/// A 128-bit vector of [4 x float].
1545	/// \param __b
1546	/// A 64-bit signed integer operand containing the value to be converted.
1547	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1548	/// converted value of the second operand. The upper 96 bits are copied from
1549	/// the upper 96 bits of the first operand.
1550	static __inline__ __m128 __DEFAULT_FN_ATTRS
1551	_mm_cvtsi64_ss(__m128 __a, long long __b)
1552	{
1553	__a[0] = __b;
1554	return __a;
1555	}
1556
1557	#endif
1558
1559	/// Converts two elements of a 64-bit vector of [2 x i32] into two
1560	/// floating point values and writes them to the lower 64-bits of the
1561	/// destination. The remaining higher order elements of the destination are
1562	/// copied from the corresponding elements in the first operand.
1563	///
1564	/// \headerfile <x86intrin.h>
1565	///
1566	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1567	///
1568	/// \param __a
1569	/// A 128-bit vector of [4 x float].
1570	/// \param __b
1571	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1572	/// and written to the corresponding low-order elements in the destination.
1573	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1574	/// converted value of the second operand. The upper 64 bits are copied from
1575	/// the upper 64 bits of the first operand.
1576	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1577	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1578	{
1579	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1580	}
1581
1582	/// Converts two elements of a 64-bit vector of [2 x i32] into two
1583	/// floating point values and writes them to the lower 64-bits of the
1584	/// destination. The remaining higher order elements of the destination are
1585	/// copied from the corresponding elements in the first operand.
1586	///
1587	/// \headerfile <x86intrin.h>
1588	///
1589	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1590	///
1591	/// \param __a
1592	/// A 128-bit vector of [4 x float].
1593	/// \param __b
1594	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1595	/// and written to the corresponding low-order elements in the destination.
1596	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1597	/// converted value from the second operand. The upper 64 bits are copied
1598	/// from the upper 64 bits of the first operand.
1599	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1600	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1601	{
1602	return _mm_cvtpi32_ps(__a, __b);
1603	}
1604
1605	/// Extracts a float value contained in the lower 32 bits of a vector of
1606	/// [4 x float].
1607	///
1608	/// \headerfile <x86intrin.h>
1609	///
1610	/// This intrinsic has no corresponding instruction.
1611	///
1612	/// \param __a
1613	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1614	/// used in the extraction.
1615	/// \returns A 32-bit float containing the extracted value.
1616	static __inline__ float __DEFAULT_FN_ATTRS
1617	_mm_cvtss_f32(__m128 __a)
1618	{
1619	return __a[0];
1620	}
1621
1622	/// Loads two packed float values from the address \a __p into the
1623	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1624	/// are copied from the low-order bits of the first operand.
1625	///
1626	/// \headerfile <x86intrin.h>
1627	///
1628	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1629	///
1630	/// \param __a
1631	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1632	/// of the destination.
1633	/// \param __p
1634	/// A pointer to two packed float values. Bits [63:0] are written to bits
1635	/// [127:64] of the destination.
1636	/// \returns A 128-bit vector of [4 x float] containing the moved values.
1637	static __inline__ __m128 __DEFAULT_FN_ATTRS
1638	_mm_loadh_pi(__m128 __a, const __m64 *__p)
1639	{
1640	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1641	struct __mm_loadh_pi_struct {
1642	__mm_loadh_pi_v2f32 __u;
1643	} __attribute__((__packed__, __may_alias__));
1644	__mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
1645	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1646	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1647	}
1648
1649	/// Loads two packed float values from the address \a __p into the
1650	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1651	/// are copied from the high-order bits of the first operand.
1652	///
1653	/// \headerfile <x86intrin.h>
1654	///
1655	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1656	///
1657	/// \param __a
1658	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1659	/// [127:64] of the destination.
1660	/// \param __p
1661	/// A pointer to two packed float values. Bits [63:0] are written to bits
1662	/// [63:0] of the destination.
1663	/// \returns A 128-bit vector of [4 x float] containing the moved values.
1664	static __inline__ __m128 __DEFAULT_FN_ATTRS
1665	_mm_loadl_pi(__m128 __a, const __m64 *__p)
1666	{
1667	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1668	struct __mm_loadl_pi_struct {
1669	__mm_loadl_pi_v2f32 __u;
1670	} __attribute__((__packed__, __may_alias__));
1671	__mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
1672	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1673	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1674	}
1675
1676	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1677	/// 32 bits of the vector are initialized with the single-precision
1678	/// floating-point value loaded from a specified memory location. The upper
1679	/// 96 bits are set to zero.
1680	///
1681	/// \headerfile <x86intrin.h>
1682	///
1683	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1684	///
1685	/// \param __p
1686	/// A pointer to a 32-bit memory location containing a single-precision
1687	/// floating-point value.
1688	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1689	/// lower 32 bits contain the value loaded from the memory location. The
1690	/// upper 96 bits are set to zero.
1691	static __inline__ __m128 __DEFAULT_FN_ATTRS
1692	_mm_load_ss(const float *__p)
1693	{
1694	struct __mm_load_ss_struct {
1695	float __u;
1696	} __attribute__((__packed__, __may_alias__));
1697	float __u = ((struct __mm_load_ss_struct*)__p)->__u;
1698	return __extension__ (__m128){ __u, 0, 0, 0 };
1699	}
1700
1701	/// Loads a 32-bit float value and duplicates it to all four vector
1702	/// elements of a 128-bit vector of [4 x float].
1703	///
1704	/// \headerfile <x86intrin.h>
1705	///
1706	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1707	/// instruction.
1708	///
1709	/// \param __p
1710	/// A pointer to a float value to be loaded and duplicated.
1711	/// \returns A 128-bit vector of [4 x float] containing the loaded and
1712	/// duplicated values.
1713	static __inline__ __m128 __DEFAULT_FN_ATTRS
1714	_mm_load1_ps(const float *__p)
1715	{
1716	struct __mm_load1_ps_struct {
1717	float __u;
1718	} __attribute__((__packed__, __may_alias__));
1719	float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
1720	return __extension__ (__m128){ __u, __u, __u, __u };
1721	}
1722
/* Alternate spelling of _mm_load1_ps; forwards its argument unchanged. */
#define _mm_load_ps1(ptr) _mm_load1_ps(ptr)
1724
1725	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1726	/// memory location.
1727	///
1728	/// \headerfile <x86intrin.h>
1729	///
1730	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1731	///
1732	/// \param __p
1733	/// A pointer to a 128-bit memory location. The address of the memory
1734	/// location has to be 128-bit aligned.
1735	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1736	static __inline__ __m128 __DEFAULT_FN_ATTRS
1737	_mm_load_ps(const float *__p)
1738	{
1739	return (__m128)__p;
1740	}
1741
1742	/// Loads a 128-bit floating-point vector of [4 x float] from an
1743	/// unaligned memory location.
1744	///
1745	/// \headerfile <x86intrin.h>
1746	///
1747	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1748	///
1749	/// \param __p
1750	/// A pointer to a 128-bit memory location. The address of the memory
1751	/// location does not have to be aligned.
1752	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1753	static __inline__ __m128 __DEFAULT_FN_ATTRS
1754	_mm_loadu_ps(const float *__p)
1755	{
1756	struct __loadu_ps {
1757	__m128_u __v;
1758	} __attribute__((__packed__, __may_alias__));
1759	return ((struct __loadu_ps*)__p)->__v;
1760	}
1761
1762	/// Loads four packed float values, in reverse order, from an aligned
1763	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1764	///
1765	/// \headerfile <x86intrin.h>
1766	///
1767	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1768	/// instruction.
1769	///
1770	/// \param __p
1771	/// A pointer to a 128-bit memory location. The address of the memory
1772	/// location has to be 128-bit aligned.
1773	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1774	/// in reverse order.
1775	static __inline__ __m128 __DEFAULT_FN_ATTRS
1776	_mm_loadr_ps(const float *__p)
1777	{
1778	__m128 __a = _mm_load_ps(__p);
1779	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1780	}
1781
1782	/// Create a 128-bit vector of [4 x float] with undefined values.
1783	///
1784	/// \headerfile <x86intrin.h>
1785	///
1786	/// This intrinsic has no corresponding instruction.
1787	///
1788	/// \returns A 128-bit vector of [4 x float] containing undefined values.
1789	static __inline__ __m128 __DEFAULT_FN_ATTRS
1790	_mm_undefined_ps(void)
1791	{
1792	return (__m128)__builtin_ia32_undef128();
1793	}
1794
1795	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1796	/// 32 bits of the vector are initialized with the specified single-precision
1797	/// floating-point value. The upper 96 bits are set to zero.
1798	///
1799	/// \headerfile <x86intrin.h>
1800	///
1801	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1802	///
1803	/// \param __w
1804	/// A single-precision floating-point value used to initialize the lower 32
1805	/// bits of the result.
1806	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1807	/// lower 32 bits contain the value provided in the source operand. The
1808	/// upper 96 bits are set to zero.
1809	static __inline__ __m128 __DEFAULT_FN_ATTRS
1810	_mm_set_ss(float __w)
1811	{
1812	return __extension__ (__m128){ __w, 0, 0, 0 };
1813	}
1814
1815	/// Constructs a 128-bit floating-point vector of [4 x float], with each
1816	/// of the four single-precision floating-point vector elements set to the
1817	/// specified single-precision floating-point value.
1818	///
1819	/// \headerfile <x86intrin.h>
1820	///
1821	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1822	///
1823	/// \param __w
1824	/// A single-precision floating-point value used to initialize each vector
1825	/// element of the result.
1826	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1827	static __inline__ __m128 __DEFAULT_FN_ATTRS
1828	_mm_set1_ps(float __w)
1829	{
1830	return __extension__ (__m128){ __w, __w, __w, __w };
1831	}
1832
1833	/* Microsoft specific. */
1834	/// Constructs a 128-bit floating-point vector of [4 x float], with each
1835	/// of the four single-precision floating-point vector elements set to the
1836	/// specified single-precision floating-point value.
1837	///
1838	/// \headerfile <x86intrin.h>
1839	///
1840	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1841	///
1842	/// \param __w
1843	/// A single-precision floating-point value used to initialize each vector
1844	/// element of the result.
1845	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1846	static __inline__ __m128 __DEFAULT_FN_ATTRS
1847	_mm_set_ps1(float __w)
1848	{
1849	return _mm_set1_ps(__w);
1850	}
1851
1852	/// Constructs a 128-bit floating-point vector of [4 x float]
1853	/// initialized with the specified single-precision floating-point values.
1854	///
1855	/// \headerfile <x86intrin.h>
1856	///
1857	/// This intrinsic is a utility function and does not correspond to a specific
1858	/// instruction.
1859	///
1860	/// \param __z
1861	/// A single-precision floating-point value used to initialize bits [127:96]
1862	/// of the result.
1863	/// \param __y
1864	/// A single-precision floating-point value used to initialize bits [95:64]
1865	/// of the result.
1866	/// \param __x
1867	/// A single-precision floating-point value used to initialize bits [63:32]
1868	/// of the result.
1869	/// \param __w
1870	/// A single-precision floating-point value used to initialize bits [31:0]
1871	/// of the result.
1872	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1873	static __inline__ __m128 __DEFAULT_FN_ATTRS
1874	_mm_set_ps(float __z, float __y, float __x, float __w)
1875	{
1876	return __extension__ (__m128){ __w, __x, __y, __z };
1877	}
1878
1879	/// Constructs a 128-bit floating-point vector of [4 x float],
1880	/// initialized in reverse order with the specified 32-bit single-precision
1881	/// float-point values.
1882	///
1883	/// \headerfile <x86intrin.h>
1884	///
1885	/// This intrinsic is a utility function and does not correspond to a specific
1886	/// instruction.
1887	///
1888	/// \param __z
1889	/// A single-precision floating-point value used to initialize bits [31:0]
1890	/// of the result.
1891	/// \param __y
1892	/// A single-precision floating-point value used to initialize bits [63:32]
1893	/// of the result.
1894	/// \param __x
1895	/// A single-precision floating-point value used to initialize bits [95:64]
1896	/// of the result.
1897	/// \param __w
1898	/// A single-precision floating-point value used to initialize bits [127:96]
1899	/// of the result.
1900	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1901	static __inline__ __m128 __DEFAULT_FN_ATTRS
1902	_mm_setr_ps(float __z, float __y, float __x, float __w)
1903	{
1904	return __extension__ (__m128){ __z, __y, __x, __w };
1905	}
1906
1907	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1908	/// to zero.
1909	///
1910	/// \headerfile <x86intrin.h>
1911	///
1912	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1913	///
1914	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1915	/// all elements set to zero.
1916	static __inline__ __m128 __DEFAULT_FN_ATTRS
1917	_mm_setzero_ps(void)
1918	{
1919	return __extension__ (__m128){ 0, 0, 0, 0 };
1920	}
1921
1922	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1923	/// memory location.
1924	///
1925	/// \headerfile <x86intrin.h>
1926	///
1927	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1928	///
1929	/// \param __p
1930	/// A pointer to a 64-bit memory location.
1931	/// \param __a
1932	/// A 128-bit vector of [4 x float] containing the values to be stored.
1933	static __inline__ void __DEFAULT_FN_ATTRS
1934	_mm_storeh_pi(__m64 *__p, __m128 __a)
1935	{
1936	__builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
1937	}
1938
1939	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1940	/// memory location.
1941	///
1942	/// \headerfile <x86intrin.h>
1943	///
1944	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1945	///
1946	/// \param __p
1947	/// A pointer to a memory location that will receive the float values.
1948	/// \param __a
1949	/// A 128-bit vector of [4 x float] containing the values to be stored.
1950	static __inline__ void __DEFAULT_FN_ATTRS
1951	_mm_storel_pi(__m64 *__p, __m128 __a)
1952	{
1953	__builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
1954	}
1955
1956	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1957	/// memory location.
1958	///
1959	/// \headerfile <x86intrin.h>
1960	///
1961	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1962	///
1963	/// \param __p
1964	/// A pointer to a 32-bit memory location.
1965	/// \param __a
1966	/// A 128-bit vector of [4 x float] containing the value to be stored.
1967	static __inline__ void __DEFAULT_FN_ATTRS
1968	_mm_store_ss(float *__p, __m128 __a)
1969	{
1970	struct __mm_store_ss_struct {
1971	float __u;
1972	} __attribute__((__packed__, __may_alias__));
1973	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1974	}
1975
1976	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1977	/// location.
1978	///
1979	/// \headerfile <x86intrin.h>
1980	///
1981	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1982	///
1983	/// \param __p
1984	/// A pointer to a 128-bit memory location. The address of the memory
1985	/// location does not have to be aligned.
1986	/// \param __a
1987	/// A 128-bit vector of [4 x float] containing the values to be stored.
1988	static __inline__ void __DEFAULT_FN_ATTRS
1989	_mm_storeu_ps(float *__p, __m128 __a)
1990	{
1991	struct __storeu_ps {
1992	__m128_u __v;
1993	} __attribute__((__packed__, __may_alias__));
1994	((struct __storeu_ps*)__p)->__v = __a;
1995	}
1996
1997	/// Stores a 128-bit vector of [4 x float] into an aligned memory
1998	/// location.
1999	///
2000	/// \headerfile <x86intrin.h>
2001	///
2002	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2003	///
2004	/// \param __p
2005	/// A pointer to a 128-bit memory location. The address of the memory
2006	/// location has to be 16-byte aligned.
2007	/// \param __a
2008	/// A 128-bit vector of [4 x float] containing the values to be stored.
2009	static __inline__ void __DEFAULT_FN_ATTRS
2010	_mm_store_ps(float *__p, __m128 __a)
2011	{
2012	(__m128)__p = __a;
2013	}
2014
2015	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2016	/// four contiguous elements in an aligned memory location.
2017	///
2018	/// \headerfile <x86intrin.h>
2019	///
2020	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2021	/// instruction.
2022	///
2023	/// \param __p
2024	/// A pointer to a 128-bit memory location.
2025	/// \param __a
2026	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2027	/// of the four contiguous elements pointed by \a __p.
2028	static __inline__ void __DEFAULT_FN_ATTRS
2029	_mm_store1_ps(float *__p, __m128 __a)
2030	{
2031	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2032	_mm_store_ps(__p, __a);
2033	}
2034
2035	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2036	/// four contiguous elements in an aligned memory location.
2037	///
2038	/// \headerfile <x86intrin.h>
2039	///
2040	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2041	/// instruction.
2042	///
2043	/// \param __p
2044	/// A pointer to a 128-bit memory location.
2045	/// \param __a
2046	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2047	/// of the four contiguous elements pointed by \a __p.
2048	static __inline__ void __DEFAULT_FN_ATTRS
2049	_mm_store_ps1(float *__p, __m128 __a)
2050	{
2051	_mm_store1_ps(__p, __a);
2052	}
2053
2054	/// Stores float values from a 128-bit vector of [4 x float] to an
2055	/// aligned memory location in reverse order.
2056	///
2057	/// \headerfile <x86intrin.h>
2058	///
2059	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2060	/// instruction.
2061	///
2062	/// \param __p
2063	/// A pointer to a 128-bit memory location. The address of the memory
2064	/// location has to be 128-bit aligned.
2065	/// \param __a
2066	/// A 128-bit vector of [4 x float] containing the values to be stored.
2067	static __inline__ void __DEFAULT_FN_ATTRS
2068	_mm_storer_ps(float *__p, __m128 __a)
2069	{
2070	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2071	_mm_store_ps(__p, __a);
2072	}
2073
/* Prefetch hint selector constants (see _mm_prefetch), in ascending order. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2080
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    pointer is only converted to <c> const void * </c>, so const-qualified
///    pointers can be passed without casting the qualifier away.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is passed as the second argument of __builtin_prefetch
///    and bits [1:0] as its third argument.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2112
2113	/// Stores a 64-bit integer in the specified aligned memory location. To
2114	/// minimize caching, the data is flagged as non-temporal (unlikely to be
2115	/// used again soon).
2116	///
2117	/// \headerfile <x86intrin.h>
2118	///
2119	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2120	///
2121	/// \param __p
2122	/// A pointer to an aligned memory location used to store the register value.
2123	/// \param __a
2124	/// A 64-bit integer containing the value to be stored.
2125	static __inline__ void __DEFAULT_FN_ATTRS_MMX
2126	_mm_stream_pi(__m64 *__p, __m64 __a)
2127	{
2128	__builtin_ia32_movntq(__p, __a);
2129	}
2130
2131	/// Moves packed float values from a 128-bit vector of [4 x float] to a
2132	/// 128-bit aligned memory location. To minimize caching, the data is flagged
2133	/// as non-temporal (unlikely to be used again soon).
2134	///
2135	/// \headerfile <x86intrin.h>
2136	///
2137	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2138	///
2139	/// \param __p
2140	/// A pointer to a 128-bit aligned memory location that will receive the
2141	/// single-precision floating-point values.
2142	/// \param __a
2143	/// A 128-bit vector of [4 x float] containing the values to be moved.
2144	static __inline__ void __DEFAULT_FN_ATTRS
2145	_mm_stream_ps(float *__p, __m128 __a)
2146	{
2147	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2148	}
2149
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and the store instructions
///    that follow it, so that all previous stores are completed before any
///    subsequent store is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2168
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Arguments and the whole expansion are parenthesized so that the casts apply
   to the complete argument expressions and the result is a single operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)))
2191
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Arguments and the whole expansion are parenthesized so that the casts apply
   to the complete argument expressions and the result is a single operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)))
2222
2223	/// Compares each of the corresponding packed 16-bit integer values of
2224	/// the 64-bit integer vectors, and writes the greater value to the
2225	/// corresponding bits in the destination.
2226	///
2227	/// \headerfile <x86intrin.h>
2228	///
2229	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2230	///
2231	/// \param __a
2232	/// A 64-bit integer vector containing one of the source operands.
2233	/// \param __b
2234	/// A 64-bit integer vector containing one of the source operands.
2235	/// \returns A 64-bit integer vector containing the comparison results.
2236	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2237	_mm_max_pi16(__m64 __a, __m64 __b)
2238	{
2239	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2240	}
2241
2242	/// Compares each of the corresponding packed 8-bit unsigned integer
2243	/// values of the 64-bit integer vectors, and writes the greater value to the
2244	/// corresponding bits in the destination.
2245	///
2246	/// \headerfile <x86intrin.h>
2247	///
2248	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2249	///
2250	/// \param __a
2251	/// A 64-bit integer vector containing one of the source operands.
2252	/// \param __b
2253	/// A 64-bit integer vector containing one of the source operands.
2254	/// \returns A 64-bit integer vector containing the comparison results.
2255	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2256	_mm_max_pu8(__m64 __a, __m64 __b)
2257	{
2258	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2259	}
2260
2261	/// Compares each of the corresponding packed 16-bit integer values of
2262	/// the 64-bit integer vectors, and writes the lesser value to the
2263	/// corresponding bits in the destination.
2264	///
2265	/// \headerfile <x86intrin.h>
2266	///
2267	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2268	///
2269	/// \param __a
2270	/// A 64-bit integer vector containing one of the source operands.
2271	/// \param __b
2272	/// A 64-bit integer vector containing one of the source operands.
2273	/// \returns A 64-bit integer vector containing the comparison results.
2274	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2275	_mm_min_pi16(__m64 __a, __m64 __b)
2276	{
2277	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2278	}
2279
2280	/// Compares each of the corresponding packed 8-bit unsigned integer
2281	/// values of the 64-bit integer vectors, and writes the lesser value to the
2282	/// corresponding bits in the destination.
2283	///
2284	/// \headerfile <x86intrin.h>
2285	///
2286	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2287	///
2288	/// \param __a
2289	/// A 64-bit integer vector containing one of the source operands.
2290	/// \param __b
2291	/// A 64-bit integer vector containing one of the source operands.
2292	/// \returns A 64-bit integer vector containing the comparison results.
2293	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2294	_mm_min_pu8(__m64 __a, __m64 __b)
2295	{
2296	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2297	}
2298
2299	/// Takes the most significant bit from each 8-bit element in a 64-bit
2300	/// integer vector to create an 8-bit mask value. Zero-extends the value to
2301	/// 32-bit integer and writes it to the destination.
2302	///
2303	/// \headerfile <x86intrin.h>
2304	///
2305	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2306	///
2307	/// \param __a
2308	/// A 64-bit integer vector containing the values with bits to be extracted.
2309	/// \returns The most significant bit from each 8-bit element in \a __a,
2310	/// written to bits [7:0].
2311	static __inline__ int __DEFAULT_FN_ATTRS_MMX
2312	_mm_movemask_pi8(__m64 __a)
2313	{
2314	return __builtin_ia32_pmovmskb((__v8qi)__a);
2315	}
2316
2317	/// Multiplies packed 16-bit unsigned integer values and writes the
2318	/// high-order 16 bits of each 32-bit product to the corresponding bits in
2319	/// the destination.
2320	///
2321	/// \headerfile <x86intrin.h>
2322	///
2323	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2324	///
2325	/// \param __a
2326	/// A 64-bit integer vector containing one of the source operands.
2327	/// \param __b
2328	/// A 64-bit integer vector containing one of the source operands.
2329	/// \returns A 64-bit integer vector containing the products of both operands.
2330	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2331	_mm_mulhi_pu16(__m64 __a, __m64 __b)
2332	{
2333	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2334	}
2335
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that the macro result behaves as a
   single operand in any surrounding expression. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2369
2370	/// Conditionally copies the values from each 8-bit element in the first
2371	/// 64-bit integer vector operand to the specified memory location, as
2372	/// specified by the most significant bit in the corresponding element in the
2373	/// second 64-bit integer vector operand.
2374	///
2375	/// To minimize caching, the data is flagged as non-temporal
2376	/// (unlikely to be used again soon).
2377	///
2378	/// \headerfile <x86intrin.h>
2379	///
2380	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2381	///
2382	/// \param __d
2383	/// A 64-bit integer vector containing the values with elements to be copied.
2384	/// \param __n
2385	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2386	/// element determines whether the corresponding element in operand \a __d
2387	/// is copied. If the most significant bit of a given element is 1, the
2388	/// corresponding element in operand \a __d is copied.
2389	/// \param __p
2390	/// A pointer to a 64-bit memory location that will receive the conditionally
2391	/// copied integer values. The address of the memory location does not have
2392	/// to be aligned.
2393	static __inline__ void __DEFAULT_FN_ATTRS_MMX
2394	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2395	{
2396	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2397	}
2398
2399	/// Computes the rounded averages of the packed unsigned 8-bit integer
2400	/// values and writes the averages to the corresponding bits in the
2401	/// destination.
2402	///
2403	/// \headerfile <x86intrin.h>
2404	///
2405	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2406	///
2407	/// \param __a
2408	/// A 64-bit integer vector containing one of the source operands.
2409	/// \param __b
2410	/// A 64-bit integer vector containing one of the source operands.
2411	/// \returns A 64-bit integer vector containing the averages of both operands.
2412	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2413	_mm_avg_pu8(__m64 __a, __m64 __b)
2414	{
2415	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2416	}
2417
2418	/// Computes the rounded averages of the packed unsigned 16-bit integer
2419	/// values and writes the averages to the corresponding bits in the
2420	/// destination.
2421	///
2422	/// \headerfile <x86intrin.h>
2423	///
2424	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2425	///
2426	/// \param __a
2427	/// A 64-bit integer vector containing one of the source operands.
2428	/// \param __b
2429	/// A 64-bit integer vector containing one of the source operands.
2430	/// \returns A 64-bit integer vector containing the averages of both operands.
2431	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2432	_mm_avg_pu16(__m64 __a, __m64 __b)
2433	{
2434	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2435	}
2436
2437	/// Subtracts the corresponding 8-bit unsigned integer values of the two
2438	/// 64-bit vector operands and computes the absolute value for each of the
2439	/// difference. Then sum of the 8 absolute differences is written to the
2440	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2441	///
2442	/// \headerfile <x86intrin.h>
2443	///
2444	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2445	///
2446	/// \param __a
2447	/// A 64-bit integer vector containing one of the source operands.
2448	/// \param __b
2449	/// A 64-bit integer vector containing one of the source operands.
2450	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2451	/// sets of absolute differences between both operands. The upper bits are
2452	/// cleared.
2453	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2454	_mm_sad_pu8(__m64 __a, __m64 __b)
2455	{
2456	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2457	}
2458
2459	#if defined(__cplusplus)
2460	extern "C" {
2461	#endif
2462
/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    There are several groups of macros associated with this
///    intrinsic, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    NOTE(review): the _MM_DENORMALS_ZERO_* macros are not defined alongside
///    the other MXCSR macros in this header; presumably another intrinsics
///    header provides them -- confirm before relying on them here.
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2514
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
///
///    Note that this expression only ORs the _MM_ROUND_UP bit into the current
///    value without clearing the rounding field first. If _MM_ROUND_DOWN was
///    already set, the combined bits equal _MM_ROUND_TOWARD_ZERO. Use
///    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP), which clears _MM_ROUND_MASK before
///    setting the new mode, when the previous mode is not known.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2568
2569	#if defined(__cplusplus)
2570	} // extern "C"
2571	#endif
2572
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result (for example a subscript) binds to the __m128 value rather
   than to the builtin call underneath the cast. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2612
2613	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2614	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2615	///
2616	/// \headerfile <x86intrin.h>
2617	///
2618	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2619	///
2620	/// \param __a
2621	/// A 128-bit vector of [4 x float]. \n
2622	/// Bits [95:64] are written to bits [31:0] of the destination. \n
2623	/// Bits [127:96] are written to bits [95:64] of the destination.
2624	/// \param __b
2625	/// A 128-bit vector of [4 x float].
2626	/// Bits [95:64] are written to bits [63:32] of the destination. \n
2627	/// Bits [127:96] are written to bits [127:96] of the destination.
2628	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2629	static __inline__ __m128 __DEFAULT_FN_ATTRS
2630	_mm_unpackhi_ps(__m128 __a, __m128 __b)
2631	{
2632	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2633	}
2634
2635	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2636	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2637	///
2638	/// \headerfile <x86intrin.h>
2639	///
2640	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2641	///
2642	/// \param __a
2643	/// A 128-bit vector of [4 x float]. \n
2644	/// Bits [31:0] are written to bits [31:0] of the destination. \n
2645	/// Bits [63:32] are written to bits [95:64] of the destination.
2646	/// \param __b
2647	/// A 128-bit vector of [4 x float]. \n
2648	/// Bits [31:0] are written to bits [63:32] of the destination. \n
2649	/// Bits [63:32] are written to bits [127:96] of the destination.
2650	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2651	static __inline__ __m128 __DEFAULT_FN_ATTRS
2652	_mm_unpacklo_ps(__m128 __a, __m128 __b)
2653	{
2654	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2655	}
2656
2657	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2658	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2659	/// 96 bits are set to the upper 96 bits of the first parameter.
2660	///
2661	/// \headerfile <x86intrin.h>
2662	///
2663	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2664	/// instruction.
2665	///
2666	/// \param __a
2667	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2668	/// written to the upper 96 bits of the result.
2669	/// \param __b
2670	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2671	/// written to the lower 32 bits of the result.
2672	/// \returns A 128-bit floating-point vector of [4 x float].
2673	static __inline__ __m128 __DEFAULT_FN_ATTRS
2674	_mm_move_ss(__m128 __a, __m128 __b)
2675	{
2676	__a[0] = __b[0];
2677	return __a;
2678	}
2679
2680	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2681	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2682	/// 64 bits are set to the upper 64 bits of the first parameter.
2683	///
2684	/// \headerfile <x86intrin.h>
2685	///
2686	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2687	///
2688	/// \param __a
2689	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2690	/// written to the upper 64 bits of the result.
2691	/// \param __b
2692	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2693	/// written to the lower 64 bits of the result.
2694	/// \returns A 128-bit floating-point vector of [4 x float].
2695	static __inline__ __m128 __DEFAULT_FN_ATTRS
2696	_mm_movehl_ps(__m128 __a, __m128 __b)
2697	{
2698	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2699	}
2700
2701	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2702	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2703	/// 64 bits are set to the lower 64 bits of the second parameter.
2704	///
2705	/// \headerfile <x86intrin.h>
2706	///
2707	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2708	///
2709	/// \param __a
2710	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2711	/// written to the lower 64 bits of the result.
2712	/// \param __b
2713	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2714	/// written to the upper 64 bits of the result.
2715	/// \returns A 128-bit floating-point vector of [4 x float].
2716	static __inline__ __m128 __DEFAULT_FN_ATTRS
2717	_mm_movelh_ps(__m128 __a, __m128 __b)
2718	{
2719	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2720	}
2721
2722	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2723	/// float].
2724	///
2725	/// \headerfile <x86intrin.h>
2726	///
2727	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2728	///
2729	/// \param __a
2730	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2731	/// from the corresponding elements in this operand.
2732	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2733	/// values from the operand.
2734	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2735	_mm_cvtpi16_ps(__m64 __a)
2736	{
2737	__m64 __b, __c;
2738	__m128 __r;
2739
2740	__b = _mm_setzero_si64();
2741	__b = _mm_cmpgt_pi16(__b, __a);
2742	__c = _mm_unpackhi_pi16(__a, __b);
2743	__r = _mm_setzero_ps();
2744	__r = _mm_cvtpi32_ps(__r, __c);
2745	__r = _mm_movelh_ps(__r, __r);
2746	__c = _mm_unpacklo_pi16(__a, __b);
2747	__r = _mm_cvtpi32_ps(__r, __c);
2748
2749	return __r;
2750	}
2751
2752	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2753	/// 128-bit vector of [4 x float].
2754	///
2755	/// \headerfile <x86intrin.h>
2756	///
2757	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2758	///
2759	/// \param __a
2760	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2761	/// destination are copied from the corresponding elements in this operand.
2762	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2763	/// values from the operand.
2764	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2765	_mm_cvtpu16_ps(__m64 __a)
2766	{
2767	__m64 __b, __c;
2768	__m128 __r;
2769
2770	__b = _mm_setzero_si64();
2771	__c = _mm_unpackhi_pi16(__a, __b);
2772	__r = _mm_setzero_ps();
2773	__r = _mm_cvtpi32_ps(__r, __c);
2774	__r = _mm_movelh_ps(__r, __r);
2775	__c = _mm_unpacklo_pi16(__a, __b);
2776	__r = _mm_cvtpi32_ps(__r, __c);
2777
2778	return __r;
2779	}
2780
2781	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2782	/// into a 128-bit vector of [4 x float].
2783	///
2784	/// \headerfile <x86intrin.h>
2785	///
2786	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2787	///
2788	/// \param __a
2789	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2790	/// from the corresponding lower 4 elements in this operand.
2791	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2792	/// values from the operand.
2793	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2794	_mm_cvtpi8_ps(__m64 __a)
2795	{
2796	__m64 __b;
2797
2798	__b = _mm_setzero_si64();
2799	__b = _mm_cmpgt_pi8(__b, __a);
2800	__b = _mm_unpacklo_pi8(__a, __b);
2801
2802	return _mm_cvtpi16_ps(__b);
2803	}
2804
2805	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2806	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2807	///
2808	/// \headerfile <x86intrin.h>
2809	///
2810	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2811	///
2812	/// \param __a
2813	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2814	/// destination are copied from the corresponding lower 4 elements in this
2815	/// operand.
2816	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2817	/// values from the source operand.
2818	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2819	_mm_cvtpu8_ps(__m64 __a)
2820	{
2821	__m64 __b;
2822
2823	__b = _mm_setzero_si64();
2824	__b = _mm_unpacklo_pi8(__a, __b);
2825
2826	return _mm_cvtpi16_ps(__b);
2827	}
2828
2829	/// Converts the two 32-bit signed integer values from each 64-bit vector
2830	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2831	///
2832	/// \headerfile <x86intrin.h>
2833	///
2834	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2835	///
2836	/// \param __a
2837	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2838	/// copied from the elements in this operand.
2839	/// \param __b
2840	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2841	/// copied from the elements in this operand.
2842	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2843	/// copied and converted values from the first operand. The upper 64 bits
2844	/// contain the copied and converted values from the second operand.
2845	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2846	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2847	{
2848	__m128 __c;
2849
2850	__c = _mm_setzero_ps();
2851	__c = _mm_cvtpi32_ps(__c, __b);
2852	__c = _mm_movelh_ps(__c, __c);
2853
2854	return _mm_cvtpi32_ps(__c, __a);
2855	}
2856
2857	/// Converts each single-precision floating-point element of a 128-bit
2858	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2859	/// packs the results into a 64-bit integer vector of [4 x i16].
2860	///
2861	/// If the floating-point element is NaN or infinity, or if the
2862	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2863	/// it is converted to 0x8000. Otherwise if the floating-point element is
2864	/// greater than 0x7FFF, it is converted to 0x7FFF.
2865	///
2866	/// \headerfile <x86intrin.h>
2867	///
2868	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2869	///
2870	/// \param __a
2871	/// A 128-bit floating-point vector of [4 x float].
2872	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2873	/// values.
2874	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2875	_mm_cvtps_pi16(__m128 __a)
2876	{
2877	__m64 __b, __c;
2878
2879	__b = _mm_cvtps_pi32(__a);
2880	__a = _mm_movehl_ps(__a, __a);
2881	__c = _mm_cvtps_pi32(__a);
2882
2883	return _mm_packs_pi32(__b, __c);
2884	}
2885
2886	/// Converts each single-precision floating-point element of a 128-bit
2887	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2888	/// packs the results into the lower 32 bits of a 64-bit integer vector of
2889	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2890	///
2891	/// If the floating-point element is NaN or infinity, or if the
2892	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2893	/// is converted to 0x80. Otherwise if the floating-point element is greater
2894	/// than 0x7F, it is converted to 0x7F.
2895	///
2896	/// \headerfile <x86intrin.h>
2897	///
2898	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2899	///
2900	/// \param __a
2901	/// 128-bit floating-point vector of [4 x float].
2902	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2903	/// converted values and the uppper 32 bits are set to zero.
2904	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2905	_mm_cvtps_pi8(__m128 __a)
2906	{
2907	__m64 __b, __c;
2908
2909	__b = _mm_cvtps_pi16(__a);
2910	__c = _mm_setzero_si64();
2911
2912	return _mm_packs_pi16(__b, __c);
2913	}
2914
2915	/// Extracts the sign bits from each single-precision floating-point
2916	/// element of a 128-bit floating-point vector of [4 x float] and returns the
2917	/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2918	/// to zero.
2919	///
2920	/// \headerfile <x86intrin.h>
2921	///
2922	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2923	///
2924	/// \param __a
2925	/// A 128-bit floating-point vector of [4 x float].
2926	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2927	/// single-precision floating-point element of the parameter. Bits [31:4] are
2928	/// set to zero.
2929	static __inline__ int __DEFAULT_FN_ATTRS
2930	_mm_movemask_ps(__m128 __a)
2931	{
2932	return __builtin_ia32_movmskps((__v4sf)__a);
2933	}
2934
2935
/* Attribute that requests 16-byte alignment for the declaration it is
   attached to. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Packs four 2-bit element selectors into one 8-bit shuffle immediate:
   z occupies bits [7:6], y bits [5:4], x bits [3:2] and w bits [1:0].
   Arguments are not masked, so each must already be in the range 0-3. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2939
/* Exception state flags, MXCSR bits [5:0]. _MM_EXCEPT_MASK covers all six. */
#define _MM_EXCEPT_INVALID    (0x0001)
#define _MM_EXCEPT_DENORM     (0x0002)
#define _MM_EXCEPT_DIV_ZERO   (0x0004)
#define _MM_EXCEPT_OVERFLOW   (0x0008)
#define _MM_EXCEPT_UNDERFLOW  (0x0010)
#define _MM_EXCEPT_INEXACT    (0x0020)
#define _MM_EXCEPT_MASK       (0x003f)

/* Exception mask bits, MXCSR bits [12:7]. _MM_MASK_MASK covers all six. */
#define _MM_MASK_INVALID      (0x0080)
#define _MM_MASK_DENORM       (0x0100)
#define _MM_MASK_DIV_ZERO     (0x0200)
#define _MM_MASK_OVERFLOW     (0x0400)
#define _MM_MASK_UNDERFLOW    (0x0800)
#define _MM_MASK_INEXACT      (0x1000)
#define _MM_MASK_MASK         (0x1f80)

/* Rounding mode field, MXCSR bits [14:13]. */
#define _MM_ROUND_NEAREST     (0x0000)
#define _MM_ROUND_DOWN        (0x2000)
#define _MM_ROUND_UP          (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK        (0x6000)

/* Flush-to-zero flag, MXCSR bit 15. */
#define _MM_FLUSH_ZERO_MASK   (0x8000)
#define _MM_FLUSH_ZERO_ON     (0x8000)
#define _MM_FLUSH_ZERO_OFF    (0x0000)

/* Read one field of MXCSR; the other bits of the result are zero. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field of MXCSR: read the register, clear the field, OR in x,
   and write the result back. x is not masked, so it must only contain bits
   that belong to the field being set. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2975
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
   __m128 lvalues row0..row3: after the macro, rowN holds element N of each
   of the original rows, in row order.

   Each argument is evaluated more than once, so arguments must be free of
   side effects. The temporaries use reserved (double-underscore) names so
   they cannot capture a caller's argument that happens to be called tmp0,
   tmp1, tmp2 or tmp3, and cannot be rewritten by a user macro. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2988
/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
   listed beside it. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Kept once so existing uses of _m_ keep expanding as before; the original
   listed this identical definition twice. */
#define _m_ _mm_
3005
3006	#undef __DEFAULT_FN_ATTRS
3007	#undef __DEFAULT_FN_ATTRS_MMX
3008
3009	/* Ugly hack for backwards-compatibility (compatible with gcc) */
3010	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3011	#include <emmintrin.h>
3012	#endif
3013
3014	#endif /* __XMMINTRIN_H */
3015

Clang Project