mmintrin.h source code [clang_source_code/lib/Headers/mmintrin.h]

1	/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
2	*
3	* Permission is hereby granted, free of charge, to any person obtaining a copy
4	* of this software and associated documentation files (the "Software"), to deal
5	* in the Software without restriction, including without limitation the rights
6	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	* copies of the Software, and to permit persons to whom the Software is
8	* furnished to do so, subject to the following conditions:
9	*
10	* The above copyright notice and this permission notice shall be included in
11	* all copies or substantial portions of the Software.
12	*
13	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	* THE SOFTWARE.
20	*
21	*===-----------------------------------------------------------------------===
22	*/
23
24	#ifndef __MMINTRIN_H
25	#define __MMINTRIN_H
26
/* Public 64-bit MMX vector type: 8 bytes wide and 8-byte aligned. */
typedef long long __m64
    __attribute__((__vector_size__(8), __aligned__(8)));

/* Element-typed 8-byte vector views, used when casting __m64 operands for
 * the MMX builtins: 1 x long long, 2 x int, 4 x short and 8 x char. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int       __v2si __attribute__((__vector_size__(8)));
typedef short     __v4hi __attribute__((__vector_size__(8)));
typedef char      __v8qi __attribute__((__vector_size__(8)));

/* Default attribute set applied to the intrinsic functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx"),            \
                 __min_vector_width__(64)))
36
/// Empties the MMX state by marking the x87 stack registers as empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
  __builtin_ia32_emms();
}
49
50	/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
51	/// value of the 32-bit integer parameter and setting the upper 32 bits to 0.
52	///
53	/// \headerfile <x86intrin.h>
54	///
55	/// This intrinsic corresponds to the <c> MOVD </c> instruction.
56	///
57	/// \param __i
58	/// A 32-bit integer value.
59	/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
60	/// parameter. The upper 32 bits are set to 0.
61	static __inline__ __m64 __DEFAULT_FN_ATTRS
62	_mm_cvtsi32_si64(int __i)
63	{
64	return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
65	}
66
67	/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
68	/// signed integer.
69	///
70	/// \headerfile <x86intrin.h>
71	///
72	/// This intrinsic corresponds to the <c> MOVD </c> instruction.
73	///
74	/// \param __m
75	/// A 64-bit integer vector.
76	/// \returns A 32-bit signed integer value containing the lower 32 bits of the
77	/// parameter.
78	static __inline__ int __DEFAULT_FN_ATTRS
79	_mm_cvtsi64_si32(__m64 __m)
80	{
81	return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
82	}
83
84	/// Casts a 64-bit signed integer value into a 64-bit integer vector.
85	///
86	/// \headerfile <x86intrin.h>
87	///
88	/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
89	///
90	/// \param __i
91	/// A 64-bit signed integer.
92	/// \returns A 64-bit integer vector containing the same bitwise pattern as the
93	/// parameter.
94	static __inline__ __m64 __DEFAULT_FN_ATTRS
95	_mm_cvtsi64_m64(long long __i)
96	{
97	return (__m64)__i;
98	}
99
100	/// Casts a 64-bit integer vector into a 64-bit signed integer value.
101	///
102	/// \headerfile <x86intrin.h>
103	///
104	/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
105	///
106	/// \param __m
107	/// A 64-bit integer vector.
108	/// \returns A 64-bit signed integer containing the same bitwise pattern as the
109	/// parameter.
110	static __inline__ long long __DEFAULT_FN_ATTRS
111	_mm_cvtm64_si64(__m64 __m)
112	{
113	return (long long)__m;
114	}
115
116	/// Converts 16-bit signed integers from both 64-bit integer vector
117	/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
118	/// a 64-bit integer vector of [8 x i8] as the result. Positive values
119	/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
120	/// are saturated to 0x80.
121	///
122	/// \headerfile <x86intrin.h>
123	///
124	/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
125	///
126	/// \param __m1
127	/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
128	/// 16-bit signed integer and is converted to an 8-bit signed integer with
129	/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
130	/// Negative values less than 0x80 are saturated to 0x80. The converted
131	/// [4 x i8] values are written to the lower 32 bits of the result.
132	/// \param __m2
133	/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
134	/// 16-bit signed integer and is converted to an 8-bit signed integer with
135	/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
136	/// Negative values less than 0x80 are saturated to 0x80. The converted
137	/// [4 x i8] values are written to the upper 32 bits of the result.
138	/// \returns A 64-bit integer vector of [8 x i8] containing the converted
139	/// values.
140	static __inline__ __m64 __DEFAULT_FN_ATTRS
141	_mm_packs_pi16(__m64 __m1, __m64 __m2)
142	{
143	return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
144	}
145
146	/// Converts 32-bit signed integers from both 64-bit integer vector
147	/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
148	/// a 64-bit integer vector of [4 x i16] as the result. Positive values
149	/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
150	/// 0x8000 are saturated to 0x8000.
151	///
152	/// \headerfile <x86intrin.h>
153	///
154	/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
155	///
156	/// \param __m1
157	/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
158	/// 32-bit signed integer and is converted to a 16-bit signed integer with
159	/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
160	/// Negative values less than 0x8000 are saturated to 0x8000. The converted
161	/// [2 x i16] values are written to the lower 32 bits of the result.
162	/// \param __m2
163	/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
164	/// 32-bit signed integer and is converted to a 16-bit signed integer with
165	/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
166	/// Negative values less than 0x8000 are saturated to 0x8000. The converted
167	/// [2 x i16] values are written to the upper 32 bits of the result.
168	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
169	/// values.
170	static __inline__ __m64 __DEFAULT_FN_ATTRS
171	_mm_packs_pi32(__m64 __m1, __m64 __m2)
172	{
173	return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
174	}
175
176	/// Converts 16-bit signed integers from both 64-bit integer vector
177	/// parameters of [4 x i16] into 8-bit unsigned integer values, and
178	/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
179	/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
180	/// to 0.
181	///
182	/// \headerfile <x86intrin.h>
183	///
184	/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
185	///
186	/// \param __m1
187	/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
188	/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
189	/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
190	/// than 0 are saturated to 0. The converted [4 x i8] values are written to
191	/// the lower 32 bits of the result.
192	/// \param __m2
193	/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
194	/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
195	/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
196	/// than 0 are saturated to 0. The converted [4 x i8] values are written to
197	/// the upper 32 bits of the result.
198	/// \returns A 64-bit integer vector of [8 x i8] containing the converted
199	/// values.
200	static __inline__ __m64 __DEFAULT_FN_ATTRS
201	_mm_packs_pu16(__m64 __m1, __m64 __m2)
202	{
203	return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
204	}
205
206	/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
207	/// and interleaves them into a 64-bit integer vector of [8 x i8].
208	///
209	/// \headerfile <x86intrin.h>
210	///
211	/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
212	///
213	/// \param __m1
214	/// A 64-bit integer vector of [8 x i8]. \n
215	/// Bits [39:32] are written to bits [7:0] of the result. \n
216	/// Bits [47:40] are written to bits [23:16] of the result. \n
217	/// Bits [55:48] are written to bits [39:32] of the result. \n
218	/// Bits [63:56] are written to bits [55:48] of the result.
219	/// \param __m2
220	/// A 64-bit integer vector of [8 x i8].
221	/// Bits [39:32] are written to bits [15:8] of the result. \n
222	/// Bits [47:40] are written to bits [31:24] of the result. \n
223	/// Bits [55:48] are written to bits [47:40] of the result. \n
224	/// Bits [63:56] are written to bits [63:56] of the result.
225	/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
226	/// values.
227	static __inline__ __m64 __DEFAULT_FN_ATTRS
228	_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
229	{
230	return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
231	}
232
233	/// Unpacks the upper 32 bits from two 64-bit integer vectors of
234	/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
235	///
236	/// \headerfile <x86intrin.h>
237	///
238	/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
239	///
240	/// \param __m1
241	/// A 64-bit integer vector of [4 x i16].
242	/// Bits [47:32] are written to bits [15:0] of the result. \n
243	/// Bits [63:48] are written to bits [47:32] of the result.
244	/// \param __m2
245	/// A 64-bit integer vector of [4 x i16].
246	/// Bits [47:32] are written to bits [31:16] of the result. \n
247	/// Bits [63:48] are written to bits [63:48] of the result.
248	/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
249	/// values.
250	static __inline__ __m64 __DEFAULT_FN_ATTRS
251	_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
252	{
253	return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
254	}
255
256	/// Unpacks the upper 32 bits from two 64-bit integer vectors of
257	/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
258	///
259	/// \headerfile <x86intrin.h>
260	///
261	/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
262	///
263	/// \param __m1
264	/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
265	/// the lower 32 bits of the result.
266	/// \param __m2
267	/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
268	/// the upper 32 bits of the result.
269	/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
270	/// values.
271	static __inline__ __m64 __DEFAULT_FN_ATTRS
272	_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
273	{
274	return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
275	}
276
277	/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
278	/// and interleaves them into a 64-bit integer vector of [8 x i8].
279	///
280	/// \headerfile <x86intrin.h>
281	///
282	/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
283	///
284	/// \param __m1
285	/// A 64-bit integer vector of [8 x i8].
286	/// Bits [7:0] are written to bits [7:0] of the result. \n
287	/// Bits [15:8] are written to bits [23:16] of the result. \n
288	/// Bits [23:16] are written to bits [39:32] of the result. \n
289	/// Bits [31:24] are written to bits [55:48] of the result.
290	/// \param __m2
291	/// A 64-bit integer vector of [8 x i8].
292	/// Bits [7:0] are written to bits [15:8] of the result. \n
293	/// Bits [15:8] are written to bits [31:24] of the result. \n
294	/// Bits [23:16] are written to bits [47:40] of the result. \n
295	/// Bits [31:24] are written to bits [63:56] of the result.
296	/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
297	/// values.
298	static __inline__ __m64 __DEFAULT_FN_ATTRS
299	_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
300	{
301	return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
302	}
303
304	/// Unpacks the lower 32 bits from two 64-bit integer vectors of
305	/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
306	///
307	/// \headerfile <x86intrin.h>
308	///
309	/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
310	///
311	/// \param __m1
312	/// A 64-bit integer vector of [4 x i16].
313	/// Bits [15:0] are written to bits [15:0] of the result. \n
314	/// Bits [31:16] are written to bits [47:32] of the result.
315	/// \param __m2
316	/// A 64-bit integer vector of [4 x i16].
317	/// Bits [15:0] are written to bits [31:16] of the result. \n
318	/// Bits [31:16] are written to bits [63:48] of the result.
319	/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
320	/// values.
321	static __inline__ __m64 __DEFAULT_FN_ATTRS
322	_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
323	{
324	return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
325	}
326
327	/// Unpacks the lower 32 bits from two 64-bit integer vectors of
328	/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
329	///
330	/// \headerfile <x86intrin.h>
331	///
332	/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
333	///
334	/// \param __m1
335	/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
336	/// the lower 32 bits of the result.
337	/// \param __m2
338	/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
339	/// the upper 32 bits of the result.
340	/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
341	/// values.
342	static __inline__ __m64 __DEFAULT_FN_ATTRS
343	_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
344	{
345	return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
346	}
347
348	/// Adds each 8-bit integer element of the first 64-bit integer vector
349	/// of [8 x i8] to the corresponding 8-bit integer element of the second
350	/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
351	/// packed into a 64-bit integer vector of [8 x i8].
352	///
353	/// \headerfile <x86intrin.h>
354	///
355	/// This intrinsic corresponds to the <c> PADDB </c> instruction.
356	///
357	/// \param __m1
358	/// A 64-bit integer vector of [8 x i8].
359	/// \param __m2
360	/// A 64-bit integer vector of [8 x i8].
361	/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
362	/// parameters.
363	static __inline__ __m64 __DEFAULT_FN_ATTRS
364	_mm_add_pi8(__m64 __m1, __m64 __m2)
365	{
366	return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
367	}
368
369	/// Adds each 16-bit integer element of the first 64-bit integer vector
370	/// of [4 x i16] to the corresponding 16-bit integer element of the second
371	/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
372	/// packed into a 64-bit integer vector of [4 x i16].
373	///
374	/// \headerfile <x86intrin.h>
375	///
376	/// This intrinsic corresponds to the <c> PADDW </c> instruction.
377	///
378	/// \param __m1
379	/// A 64-bit integer vector of [4 x i16].
380	/// \param __m2
381	/// A 64-bit integer vector of [4 x i16].
382	/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
383	/// parameters.
384	static __inline__ __m64 __DEFAULT_FN_ATTRS
385	_mm_add_pi16(__m64 __m1, __m64 __m2)
386	{
387	return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
388	}
389
390	/// Adds each 32-bit integer element of the first 64-bit integer vector
391	/// of [2 x i32] to the corresponding 32-bit integer element of the second
392	/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
393	/// packed into a 64-bit integer vector of [2 x i32].
394	///
395	/// \headerfile <x86intrin.h>
396	///
397	/// This intrinsic corresponds to the <c> PADDD </c> instruction.
398	///
399	/// \param __m1
400	/// A 64-bit integer vector of [2 x i32].
401	/// \param __m2
402	/// A 64-bit integer vector of [2 x i32].
403	/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
404	/// parameters.
405	static __inline__ __m64 __DEFAULT_FN_ATTRS
406	_mm_add_pi32(__m64 __m1, __m64 __m2)
407	{
408	return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
409	}
410
411	/// Adds each 8-bit signed integer element of the first 64-bit integer
412	/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
413	/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
414	/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
415	/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8].
416	///
417	/// \headerfile <x86intrin.h>
418	///
419	/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
420	///
421	/// \param __m1
422	/// A 64-bit integer vector of [8 x i8].
423	/// \param __m2
424	/// A 64-bit integer vector of [8 x i8].
425	/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
426	/// of both parameters.
427	static __inline__ __m64 __DEFAULT_FN_ATTRS
428	_mm_adds_pi8(__m64 __m1, __m64 __m2)
429	{
430	return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
431	}
432
433	/// Adds each 16-bit signed integer element of the first 64-bit integer
434	/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
435	/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
436	/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
437	/// saturated to 0x8000. The results are packed into a 64-bit integer vector
438	/// of [4 x i16].
439	///
440	/// \headerfile <x86intrin.h>
441	///
442	/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
443	///
444	/// \param __m1
445	/// A 64-bit integer vector of [4 x i16].
446	/// \param __m2
447	/// A 64-bit integer vector of [4 x i16].
448	/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
449	/// of both parameters.
450	static __inline__ __m64 __DEFAULT_FN_ATTRS
451	_mm_adds_pi16(__m64 __m1, __m64 __m2)
452	{
453	return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
454	}
455
456	/// Adds each 8-bit unsigned integer element of the first 64-bit integer
457	/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
458	/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
459	/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
460	/// [8 x i8].
461	///
462	/// \headerfile <x86intrin.h>
463	///
464	/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
465	///
466	/// \param __m1
467	/// A 64-bit integer vector of [8 x i8].
468	/// \param __m2
469	/// A 64-bit integer vector of [8 x i8].
470	/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
471	/// unsigned sums of both parameters.
472	static __inline__ __m64 __DEFAULT_FN_ATTRS
473	_mm_adds_pu8(__m64 __m1, __m64 __m2)
474	{
475	return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
476	}
477
478	/// Adds each 16-bit unsigned integer element of the first 64-bit integer
479	/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
480	/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
481	/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
482	/// integer vector of [4 x i16].
483	///
484	/// \headerfile <x86intrin.h>
485	///
486	/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
487	///
488	/// \param __m1
489	/// A 64-bit integer vector of [4 x i16].
490	/// \param __m2
491	/// A 64-bit integer vector of [4 x i16].
492	/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
493	/// unsigned sums of both parameters.
494	static __inline__ __m64 __DEFAULT_FN_ATTRS
495	_mm_adds_pu16(__m64 __m1, __m64 __m2)
496	{
497	return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
498	}
499
500	/// Subtracts each 8-bit integer element of the second 64-bit integer
501	/// vector of [8 x i8] from the corresponding 8-bit integer element of the
502	/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
503	/// are packed into a 64-bit integer vector of [8 x i8].
504	///
505	/// \headerfile <x86intrin.h>
506	///
507	/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
508	///
509	/// \param __m1
510	/// A 64-bit integer vector of [8 x i8] containing the minuends.
511	/// \param __m2
512	/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
513	/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
514	/// both parameters.
515	static __inline__ __m64 __DEFAULT_FN_ATTRS
516	_mm_sub_pi8(__m64 __m1, __m64 __m2)
517	{
518	return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
519	}
520
521	/// Subtracts each 16-bit integer element of the second 64-bit integer
522	/// vector of [4 x i16] from the corresponding 16-bit integer element of the
523	/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
524	/// results are packed into a 64-bit integer vector of [4 x i16].
525	///
526	/// \headerfile <x86intrin.h>
527	///
528	/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
529	///
530	/// \param __m1
531	/// A 64-bit integer vector of [4 x i16] containing the minuends.
532	/// \param __m2
533	/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
534	/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
535	/// both parameters.
536	static __inline__ __m64 __DEFAULT_FN_ATTRS
537	_mm_sub_pi16(__m64 __m1, __m64 __m2)
538	{
539	return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
540	}
541
542	/// Subtracts each 32-bit integer element of the second 64-bit integer
543	/// vector of [2 x i32] from the corresponding 32-bit integer element of the
544	/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
545	/// results are packed into a 64-bit integer vector of [2 x i32].
546	///
547	/// \headerfile <x86intrin.h>
548	///
549	/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
550	///
551	/// \param __m1
552	/// A 64-bit integer vector of [2 x i32] containing the minuends.
553	/// \param __m2
554	/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
555	/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
556	/// both parameters.
557	static __inline__ __m64 __DEFAULT_FN_ATTRS
558	_mm_sub_pi32(__m64 __m1, __m64 __m2)
559	{
560	return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
561	}
562
563	/// Subtracts each 8-bit signed integer element of the second 64-bit
564	/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
565	/// element of the first 64-bit integer vector of [8 x i8]. Positive results
566	/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
567	/// are saturated to 0x80. The results are packed into a 64-bit integer
568	/// vector of [8 x i8].
569	///
570	/// \headerfile <x86intrin.h>
571	///
572	/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
573	///
574	/// \param __m1
575	/// A 64-bit integer vector of [8 x i8] containing the minuends.
576	/// \param __m2
577	/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
578	/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
579	/// differences of both parameters.
580	static __inline__ __m64 __DEFAULT_FN_ATTRS
581	_mm_subs_pi8(__m64 __m1, __m64 __m2)
582	{
583	return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
584	}
585
586	/// Subtracts each 16-bit signed integer element of the second 64-bit
587	/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
588	/// element of the first 64-bit integer vector of [4 x i16]. Positive results
589	/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
590	/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit
591	/// integer vector of [4 x i16].
592	///
593	/// \headerfile <x86intrin.h>
594	///
595	/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
596	///
597	/// \param __m1
598	/// A 64-bit integer vector of [4 x i16] containing the minuends.
599	/// \param __m2
600	/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
601	/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
602	/// differences of both parameters.
603	static __inline__ __m64 __DEFAULT_FN_ATTRS
604	_mm_subs_pi16(__m64 __m1, __m64 __m2)
605	{
606	return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
607	}
608
609	/// Subtracts each 8-bit unsigned integer element of the second 64-bit
610	/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
611	/// element of the first 64-bit integer vector of [8 x i8].
612	///
613	/// If an element of the first vector is less than the corresponding element
614	/// of the second vector, the result is saturated to 0. The results are
615	/// packed into a 64-bit integer vector of [8 x i8].
616	///
617	/// \headerfile <x86intrin.h>
618	///
619	/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
620	///
621	/// \param __m1
622	/// A 64-bit integer vector of [8 x i8] containing the minuends.
623	/// \param __m2
624	/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
625	/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
626	/// differences of both parameters.
627	static __inline__ __m64 __DEFAULT_FN_ATTRS
628	_mm_subs_pu8(__m64 __m1, __m64 __m2)
629	{
630	return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
631	}
632
633	/// Subtracts each 16-bit unsigned integer element of the second 64-bit
634	/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
635	/// integer element of the first 64-bit integer vector of [4 x i16].
636	///
637	/// If an element of the first vector is less than the corresponding element
638	/// of the second vector, the result is saturated to 0. The results are
639	/// packed into a 64-bit integer vector of [4 x i16].
640	///
641	/// \headerfile <x86intrin.h>
642	///
643	/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
644	///
645	/// \param __m1
646	/// A 64-bit integer vector of [4 x i16] containing the minuends.
647	/// \param __m2
648	/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
649	/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
650	/// differences of both parameters.
651	static __inline__ __m64 __DEFAULT_FN_ATTRS
652	_mm_subs_pu16(__m64 __m1, __m64 __m2)
653	{
654	return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
655	}
656
657	/// Multiplies each 16-bit signed integer element of the first 64-bit
658	/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
659	/// element of the second 64-bit integer vector of [4 x i16] and get four
660	/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
661	/// The lower 32 bits of these two sums are packed into a 64-bit integer
662	/// vector of [2 x i32].
663	///
664	/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
665	/// of both parameters are multiplied, and the sum of both results is written
666	/// to bits [31:0] of the result.
667	///
668	/// \headerfile <x86intrin.h>
669	///
670	/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
671	///
672	/// \param __m1
673	/// A 64-bit integer vector of [4 x i16].
674	/// \param __m2
675	/// A 64-bit integer vector of [4 x i16].
676	/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
677	/// products of both parameters.
678	static __inline__ __m64 __DEFAULT_FN_ATTRS
679	_mm_madd_pi16(__m64 __m1, __m64 __m2)
680	{
681	return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
682	}
683
684	/// Multiplies each 16-bit signed integer element of the first 64-bit
685	/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
686	/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper
687	/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
688	///
689	/// \headerfile <x86intrin.h>
690	///
691	/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
692	///
693	/// \param __m1
694	/// A 64-bit integer vector of [4 x i16].
695	/// \param __m2
696	/// A 64-bit integer vector of [4 x i16].
697	/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
698	/// of the products of both parameters.
699	static __inline__ __m64 __DEFAULT_FN_ATTRS
700	_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
701	{
702	return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
703	}
704
705	/// Multiplies each 16-bit signed integer element of the first 64-bit
706	/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
707	/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower
708	/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
709	///
710	/// \headerfile <x86intrin.h>
711	///
712	/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
713	///
714	/// \param __m1
715	/// A 64-bit integer vector of [4 x i16].
716	/// \param __m2
717	/// A 64-bit integer vector of [4 x i16].
718	/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
719	/// of the products of both parameters.
720	static __inline__ __m64 __DEFAULT_FN_ATTRS
721	_mm_mullo_pi16(__m64 __m1, __m64 __m2)
722	{
723	return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
724	}
725
726	/// Left-shifts each 16-bit signed integer element of the first
727	/// parameter, which is a 64-bit integer vector of [4 x i16], by the number
728	/// of bits specified by the second parameter, which is a 64-bit integer. The
729	/// lower 16 bits of the results are packed into a 64-bit integer vector of
730	/// [4 x i16].
731	///
732	/// \headerfile <x86intrin.h>
733	///
734	/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
735	///
736	/// \param __m
737	/// A 64-bit integer vector of [4 x i16].
738	/// \param __count
739	/// A 64-bit integer vector interpreted as a single 64-bit integer.
740	/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
741	/// values. If \a __count is greater or equal to 16, the result is set to all
742	/// 0.
743	static __inline__ __m64 __DEFAULT_FN_ATTRS
744	_mm_sll_pi16(__m64 __m, __m64 __count)
745	{
746	return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
747	}
748
749	/// Left-shifts each 16-bit signed integer element of a 64-bit integer
750	/// vector of [4 x i16] by the number of bits specified by a 32-bit integer.
751	/// The lower 16 bits of the results are packed into a 64-bit integer vector
752	/// of [4 x i16].
753	///
754	/// \headerfile <x86intrin.h>
755	///
756	/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
757	///
758	/// \param __m
759	/// A 64-bit integer vector of [4 x i16].
760	/// \param __count
761	/// A 32-bit integer value.
762	/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
763	/// values. If \a __count is greater or equal to 16, the result is set to all
764	/// 0.
765	static __inline__ __m64 __DEFAULT_FN_ATTRS
766	_mm_slli_pi16(__m64 __m, int __count)
767	{
768	return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
769	}
770
771	/// Left-shifts each 32-bit signed integer element of the first
772	/// parameter, which is a 64-bit integer vector of [2 x i32], by the number
773	/// of bits specified by the second parameter, which is a 64-bit integer. The
774	/// lower 32 bits of the results are packed into a 64-bit integer vector of
775	/// [2 x i32].
776	///
777	/// \headerfile <x86intrin.h>
778	///
779	/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
780	///
781	/// \param __m
782	/// A 64-bit integer vector of [2 x i32].
783	/// \param __count
784	/// A 64-bit integer vector interpreted as a single 64-bit integer.
785	/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
786	/// values. If \a __count is greater or equal to 32, the result is set to all
787	/// 0.
788	static __inline__ __m64 __DEFAULT_FN_ATTRS
789	_mm_sll_pi32(__m64 __m, __m64 __count)
790	{
791	return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
792	}
793
794	/// Left-shifts each 32-bit signed integer element of a 64-bit integer
795	/// vector of [2 x i32] by the number of bits specified by a 32-bit integer.
796	/// The lower 32 bits of the results are packed into a 64-bit integer vector
797	/// of [2 x i32].
798	///
799	/// \headerfile <x86intrin.h>
800	///
801	/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
802	///
803	/// \param __m
804	/// A 64-bit integer vector of [2 x i32].
805	/// \param __count
806	/// A 32-bit integer value.
807	/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
808	/// values. If \a __count is greater or equal to 32, the result is set to all
809	/// 0.
810	static __inline__ __m64 __DEFAULT_FN_ATTRS
811	_mm_slli_pi32(__m64 __m, int __count)
812	{
813	return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
814	}
815
816	/// Left-shifts the first 64-bit integer parameter by the number of bits
817	/// specified by the second 64-bit integer parameter. The lower 64 bits of
818	/// result are returned.
819	///
820	/// \headerfile <x86intrin.h>
821	///
822	/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
823	///
824	/// \param __m
825	/// A 64-bit integer vector interpreted as a single 64-bit integer.
826	/// \param __count
827	/// A 64-bit integer vector interpreted as a single 64-bit integer.
828	/// \returns A 64-bit integer vector containing the left-shifted value. If
829	/// \a __count is greater or equal to 64, the result is set to 0.
830	static __inline__ __m64 __DEFAULT_FN_ATTRS
831	_mm_sll_si64(__m64 __m, __m64 __count)
832	{
833	return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
834	}
835
836	/// Left-shifts the first parameter, which is a 64-bit integer, by the
837	/// number of bits specified by the second parameter, which is a 32-bit
838	/// integer. The lower 64 bits of result are returned.
839	///
840	/// \headerfile <x86intrin.h>
841	///
842	/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
843	///
844	/// \param __m
845	/// A 64-bit integer vector interpreted as a single 64-bit integer.
846	/// \param __count
847	/// A 32-bit integer value.
848	/// \returns A 64-bit integer vector containing the left-shifted value. If
849	/// \a __count is greater or equal to 64, the result is set to 0.
850	static __inline__ __m64 __DEFAULT_FN_ATTRS
851	_mm_slli_si64(__m64 __m, int __count)
852	{
853	return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
854	}
855
856	/// Right-shifts each 16-bit integer element of the first parameter,
857	/// which is a 64-bit integer vector of [4 x i16], by the number of bits
858	/// specified by the second parameter, which is a 64-bit integer.
859	///
860	/// High-order bits are filled with the sign bit of the initial value of each
861	/// 16-bit element. The 16-bit results are packed into a 64-bit integer
862	/// vector of [4 x i16].
863	///
864	/// \headerfile <x86intrin.h>
865	///
866	/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
867	///
868	/// \param __m
869	/// A 64-bit integer vector of [4 x i16].
870	/// \param __count
871	/// A 64-bit integer vector interpreted as a single 64-bit integer.
872	/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
873	/// values.
874	static __inline__ __m64 __DEFAULT_FN_ATTRS
875	_mm_sra_pi16(__m64 __m, __m64 __count)
876	{
877	return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
878	}
879
880	/// Right-shifts each 16-bit integer element of a 64-bit integer vector
881	/// of [4 x i16] by the number of bits specified by a 32-bit integer.
882	///
883	/// High-order bits are filled with the sign bit of the initial value of each
884	/// 16-bit element. The 16-bit results are packed into a 64-bit integer
885	/// vector of [4 x i16].
886	///
887	/// \headerfile <x86intrin.h>
888	///
889	/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
890	///
891	/// \param __m
892	/// A 64-bit integer vector of [4 x i16].
893	/// \param __count
894	/// A 32-bit integer value.
895	/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
896	/// values.
897	static __inline__ __m64 __DEFAULT_FN_ATTRS
898	_mm_srai_pi16(__m64 __m, int __count)
899	{
900	return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
901	}
902
903	/// Right-shifts each 32-bit integer element of the first parameter,
904	/// which is a 64-bit integer vector of [2 x i32], by the number of bits
905	/// specified by the second parameter, which is a 64-bit integer.
906	///
907	/// High-order bits are filled with the sign bit of the initial value of each
908	/// 32-bit element. The 32-bit results are packed into a 64-bit integer
909	/// vector of [2 x i32].
910	///
911	/// \headerfile <x86intrin.h>
912	///
913	/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
914	///
915	/// \param __m
916	/// A 64-bit integer vector of [2 x i32].
917	/// \param __count
918	/// A 64-bit integer vector interpreted as a single 64-bit integer.
919	/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
920	/// values.
921	static __inline__ __m64 __DEFAULT_FN_ATTRS
922	_mm_sra_pi32(__m64 __m, __m64 __count)
923	{
924	return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
925	}
926
927	/// Right-shifts each 32-bit integer element of a 64-bit integer vector
928	/// of [2 x i32] by the number of bits specified by a 32-bit integer.
929	///
930	/// High-order bits are filled with the sign bit of the initial value of each
931	/// 32-bit element. The 32-bit results are packed into a 64-bit integer
932	/// vector of [2 x i32].
933	///
934	/// \headerfile <x86intrin.h>
935	///
936	/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
937	///
938	/// \param __m
939	/// A 64-bit integer vector of [2 x i32].
940	/// \param __count
941	/// A 32-bit integer value.
942	/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
943	/// values.
944	static __inline__ __m64 __DEFAULT_FN_ATTRS
945	_mm_srai_pi32(__m64 __m, int __count)
946	{
947	return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
948	}
949
950	/// Right-shifts each 16-bit integer element of the first parameter,
951	/// which is a 64-bit integer vector of [4 x i16], by the number of bits
952	/// specified by the second parameter, which is a 64-bit integer.
953	///
954	/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
955	/// integer vector of [4 x i16].
956	///
957	/// \headerfile <x86intrin.h>
958	///
959	/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
960	///
961	/// \param __m
962	/// A 64-bit integer vector of [4 x i16].
963	/// \param __count
964	/// A 64-bit integer vector interpreted as a single 64-bit integer.
965	/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
966	/// values.
967	static __inline__ __m64 __DEFAULT_FN_ATTRS
968	_mm_srl_pi16(__m64 __m, __m64 __count)
969	{
970	return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
971	}
972
973	/// Right-shifts each 16-bit integer element of a 64-bit integer vector
974	/// of [4 x i16] by the number of bits specified by a 32-bit integer.
975	///
976	/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
977	/// integer vector of [4 x i16].
978	///
979	/// \headerfile <x86intrin.h>
980	///
981	/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
982	///
983	/// \param __m
984	/// A 64-bit integer vector of [4 x i16].
985	/// \param __count
986	/// A 32-bit integer value.
987	/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
988	/// values.
989	static __inline__ __m64 __DEFAULT_FN_ATTRS
990	_mm_srli_pi16(__m64 __m, int __count)
991	{
992	return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
993	}
994
995	/// Right-shifts each 32-bit integer element of the first parameter,
996	/// which is a 64-bit integer vector of [2 x i32], by the number of bits
997	/// specified by the second parameter, which is a 64-bit integer.
998	///
999	/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
1000	/// integer vector of [2 x i32].
1001	///
1002	/// \headerfile <x86intrin.h>
1003	///
1004	/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1005	///
1006	/// \param __m
1007	/// A 64-bit integer vector of [2 x i32].
1008	/// \param __count
1009	/// A 64-bit integer vector interpreted as a single 64-bit integer.
1010	/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1011	/// values.
1012	static __inline__ __m64 __DEFAULT_FN_ATTRS
1013	_mm_srl_pi32(__m64 __m, __m64 __count)
1014	{
1015	return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
1016	}
1017
1018	/// Right-shifts each 32-bit integer element of a 64-bit integer vector
1019	/// of [2 x i32] by the number of bits specified by a 32-bit integer.
1020	///
1021	/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
1022	/// integer vector of [2 x i32].
1023	///
1024	/// \headerfile <x86intrin.h>
1025	///
1026	/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1027	///
1028	/// \param __m
1029	/// A 64-bit integer vector of [2 x i32].
1030	/// \param __count
1031	/// A 32-bit integer value.
1032	/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1033	/// values.
1034	static __inline__ __m64 __DEFAULT_FN_ATTRS
1035	_mm_srli_pi32(__m64 __m, int __count)
1036	{
1037	return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
1038	}
1039
1040	/// Right-shifts the first 64-bit integer parameter by the number of bits
1041	/// specified by the second 64-bit integer parameter.
1042	///
1043	/// High-order bits are cleared.
1044	///
1045	/// \headerfile <x86intrin.h>
1046	///
1047	/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1048	///
1049	/// \param __m
1050	/// A 64-bit integer vector interpreted as a single 64-bit integer.
1051	/// \param __count
1052	/// A 64-bit integer vector interpreted as a single 64-bit integer.
1053	/// \returns A 64-bit integer vector containing the right-shifted value.
1054	static __inline__ __m64 __DEFAULT_FN_ATTRS
1055	_mm_srl_si64(__m64 __m, __m64 __count)
1056	{
1057	return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
1058	}
1059
1060	/// Right-shifts the first parameter, which is a 64-bit integer, by the
1061	/// number of bits specified by the second parameter, which is a 32-bit
1062	/// integer.
1063	///
1064	/// High-order bits are cleared.
1065	///
1066	/// \headerfile <x86intrin.h>
1067	///
1068	/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1069	///
1070	/// \param __m
1071	/// A 64-bit integer vector interpreted as a single 64-bit integer.
1072	/// \param __count
1073	/// A 32-bit integer value.
1074	/// \returns A 64-bit integer vector containing the right-shifted value.
1075	static __inline__ __m64 __DEFAULT_FN_ATTRS
1076	_mm_srli_si64(__m64 __m, int __count)
1077	{
1078	return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
1079	}
1080
1081	/// Performs a bitwise AND of two 64-bit integer vectors.
1082	///
1083	/// \headerfile <x86intrin.h>
1084	///
1085	/// This intrinsic corresponds to the <c> PAND </c> instruction.
1086	///
1087	/// \param __m1
1088	/// A 64-bit integer vector.
1089	/// \param __m2
1090	/// A 64-bit integer vector.
1091	/// \returns A 64-bit integer vector containing the bitwise AND of both
1092	/// parameters.
1093	static __inline__ __m64 __DEFAULT_FN_ATTRS
1094	_mm_and_si64(__m64 __m1, __m64 __m2)
1095	{
1096	return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
1097	}
1098
1099	/// Performs a bitwise NOT of the first 64-bit integer vector, and then
1100	/// performs a bitwise AND of the intermediate result and the second 64-bit
1101	/// integer vector.
1102	///
1103	/// \headerfile <x86intrin.h>
1104	///
1105	/// This intrinsic corresponds to the <c> PANDN </c> instruction.
1106	///
1107	/// \param __m1
1108	/// A 64-bit integer vector. The one's complement of this parameter is used
1109	/// in the bitwise AND.
1110	/// \param __m2
1111	/// A 64-bit integer vector.
1112	/// \returns A 64-bit integer vector containing the bitwise AND of the second
1113	/// parameter and the one's complement of the first parameter.
1114	static __inline__ __m64 __DEFAULT_FN_ATTRS
1115	_mm_andnot_si64(__m64 __m1, __m64 __m2)
1116	{
1117	return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
1118	}
1119
1120	/// Performs a bitwise OR of two 64-bit integer vectors.
1121	///
1122	/// \headerfile <x86intrin.h>
1123	///
1124	/// This intrinsic corresponds to the <c> POR </c> instruction.
1125	///
1126	/// \param __m1
1127	/// A 64-bit integer vector.
1128	/// \param __m2
1129	/// A 64-bit integer vector.
1130	/// \returns A 64-bit integer vector containing the bitwise OR of both
1131	/// parameters.
1132	static __inline__ __m64 __DEFAULT_FN_ATTRS
1133	_mm_or_si64(__m64 __m1, __m64 __m2)
1134	{
1135	return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
1136	}
1137
1138	/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1139	///
1140	/// \headerfile <x86intrin.h>
1141	///
1142	/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1143	///
1144	/// \param __m1
1145	/// A 64-bit integer vector.
1146	/// \param __m2
1147	/// A 64-bit integer vector.
1148	/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1149	/// parameters.
1150	static __inline__ __m64 __DEFAULT_FN_ATTRS
1151	_mm_xor_si64(__m64 __m1, __m64 __m2)
1152	{
1153	return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
1154	}
1155
1156	/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1157	/// [8 x i8] to determine if the element of the first vector is equal to the
1158	/// corresponding element of the second vector.
1159	///
1160	/// The comparison yields 0 for false, 0xFF for true.
1161	///
1162	/// \headerfile <x86intrin.h>
1163	///
1164	/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1165	///
1166	/// \param __m1
1167	/// A 64-bit integer vector of [8 x i8].
1168	/// \param __m2
1169	/// A 64-bit integer vector of [8 x i8].
1170	/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1171	/// results.
1172	static __inline__ __m64 __DEFAULT_FN_ATTRS
1173	_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1174	{
1175	return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
1176	}
1177
1178	/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1179	/// [4 x i16] to determine if the element of the first vector is equal to the
1180	/// corresponding element of the second vector.
1181	///
1182	/// The comparison yields 0 for false, 0xFFFF for true.
1183	///
1184	/// \headerfile <x86intrin.h>
1185	///
1186	/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1187	///
1188	/// \param __m1
1189	/// A 64-bit integer vector of [4 x i16].
1190	/// \param __m2
1191	/// A 64-bit integer vector of [4 x i16].
1192	/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1193	/// results.
1194	static __inline__ __m64 __DEFAULT_FN_ATTRS
1195	_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1196	{
1197	return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
1198	}
1199
1200	/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1201	/// [2 x i32] to determine if the element of the first vector is equal to the
1202	/// corresponding element of the second vector.
1203	///
1204	/// The comparison yields 0 for false, 0xFFFFFFFF for true.
1205	///
1206	/// \headerfile <x86intrin.h>
1207	///
1208	/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1209	///
1210	/// \param __m1
1211	/// A 64-bit integer vector of [2 x i32].
1212	/// \param __m2
1213	/// A 64-bit integer vector of [2 x i32].
1214	/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1215	/// results.
1216	static __inline__ __m64 __DEFAULT_FN_ATTRS
1217	_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1218	{
1219	return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
1220	}
1221
1222	/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1223	/// [8 x i8] to determine if the element of the first vector is greater than
1224	/// the corresponding element of the second vector.
1225	///
1226	/// The comparison yields 0 for false, 0xFF for true.
1227	///
1228	/// \headerfile <x86intrin.h>
1229	///
1230	/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1231	///
1232	/// \param __m1
1233	/// A 64-bit integer vector of [8 x i8].
1234	/// \param __m2
1235	/// A 64-bit integer vector of [8 x i8].
1236	/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1237	/// results.
1238	static __inline__ __m64 __DEFAULT_FN_ATTRS
1239	_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1240	{
1241	return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
1242	}
1243
1244	/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1245	/// [4 x i16] to determine if the element of the first vector is greater than
1246	/// the corresponding element of the second vector.
1247	///
1248	/// The comparison yields 0 for false, 0xFFFF for true.
1249	///
1250	/// \headerfile <x86intrin.h>
1251	///
1252	/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1253	///
1254	/// \param __m1
1255	/// A 64-bit integer vector of [4 x i16].
1256	/// \param __m2
1257	/// A 64-bit integer vector of [4 x i16].
1258	/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1259	/// results.
1260	static __inline__ __m64 __DEFAULT_FN_ATTRS
1261	_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1262	{
1263	return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
1264	}
1265
1266	/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1267	/// [2 x i32] to determine if the element of the first vector is greater than
1268	/// the corresponding element of the second vector.
1269	///
1270	/// The comparison yields 0 for false, 0xFFFFFFFF for true.
1271	///
1272	/// \headerfile <x86intrin.h>
1273	///
1274	/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1275	///
1276	/// \param __m1
1277	/// A 64-bit integer vector of [2 x i32].
1278	/// \param __m2
1279	/// A 64-bit integer vector of [2 x i32].
1280	/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1281	/// results.
1282	static __inline__ __m64 __DEFAULT_FN_ATTRS
1283	_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1284	{
1285	return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
1286	}
1287
1288	/// Constructs a 64-bit integer vector initialized to zero.
1289	///
1290	/// \headerfile <x86intrin.h>
1291	///
1292	/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1293	///
1294	/// \returns An initialized 64-bit integer vector with all elements set to zero.
1295	static __inline__ __m64 __DEFAULT_FN_ATTRS
1296	_mm_setzero_si64(void)
1297	{
1298	return __extension__ (__m64){ 0LL };
1299	}
1300
1301	/// Constructs a 64-bit integer vector initialized with the specified
1302	/// 32-bit integer values.
1303	///
1304	/// \headerfile <x86intrin.h>
1305	///
1306	/// This intrinsic is a utility function and does not correspond to a specific
1307	/// instruction.
1308	///
1309	/// \param __i1
1310	/// A 32-bit integer value used to initialize the upper 32 bits of the
1311	/// result.
1312	/// \param __i0
1313	/// A 32-bit integer value used to initialize the lower 32 bits of the
1314	/// result.
1315	/// \returns An initialized 64-bit integer vector.
1316	static __inline__ __m64 __DEFAULT_FN_ATTRS
1317	_mm_set_pi32(int __i1, int __i0)
1318	{
1319	return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
1320	}
1321
1322	/// Constructs a 64-bit integer vector initialized with the specified
1323	/// 16-bit integer values.
1324	///
1325	/// \headerfile <x86intrin.h>
1326	///
1327	/// This intrinsic is a utility function and does not correspond to a specific
1328	/// instruction.
1329	///
1330	/// \param __s3
1331	/// A 16-bit integer value used to initialize bits [63:48] of the result.
1332	/// \param __s2
1333	/// A 16-bit integer value used to initialize bits [47:32] of the result.
1334	/// \param __s1
1335	/// A 16-bit integer value used to initialize bits [31:16] of the result.
1336	/// \param __s0
1337	/// A 16-bit integer value used to initialize bits [15:0] of the result.
1338	/// \returns An initialized 64-bit integer vector.
1339	static __inline__ __m64 __DEFAULT_FN_ATTRS
1340	_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1341	{
1342	return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
1343	}
1344
1345	/// Constructs a 64-bit integer vector initialized with the specified
1346	/// 8-bit integer values.
1347	///
1348	/// \headerfile <x86intrin.h>
1349	///
1350	/// This intrinsic is a utility function and does not correspond to a specific
1351	/// instruction.
1352	///
1353	/// \param __b7
1354	/// An 8-bit integer value used to initialize bits [63:56] of the result.
1355	/// \param __b6
1356	/// An 8-bit integer value used to initialize bits [55:48] of the result.
1357	/// \param __b5
1358	/// An 8-bit integer value used to initialize bits [47:40] of the result.
1359	/// \param __b4
1360	/// An 8-bit integer value used to initialize bits [39:32] of the result.
1361	/// \param __b3
1362	/// An 8-bit integer value used to initialize bits [31:24] of the result.
1363	/// \param __b2
1364	/// An 8-bit integer value used to initialize bits [23:16] of the result.
1365	/// \param __b1
1366	/// An 8-bit integer value used to initialize bits [15:8] of the result.
1367	/// \param __b0
1368	/// An 8-bit integer value used to initialize bits [7:0] of the result.
1369	/// \returns An initialized 64-bit integer vector.
1370	static __inline__ __m64 __DEFAULT_FN_ATTRS
1371	_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1372	char __b1, char __b0)
1373	{
1374	return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
1375	__b4, __b5, __b6, __b7);
1376	}
1377
1378	/// Constructs a 64-bit integer vector of [2 x i32], with each of the
1379	/// 32-bit integer vector elements set to the specified 32-bit integer
1380	/// value.
1381	///
1382	/// \headerfile <x86intrin.h>
1383	///
1384	/// This intrinsic is a utility function and does not correspond to a specific
1385	/// instruction.
1386	///
1387	/// \param __i
1388	/// A 32-bit integer value used to initialize each vector element of the
1389	/// result.
1390	/// \returns An initialized 64-bit integer vector of [2 x i32].
1391	static __inline__ __m64 __DEFAULT_FN_ATTRS
1392	_mm_set1_pi32(int __i)
1393	{
1394	return _mm_set_pi32(__i, __i);
1395	}
1396
1397	/// Constructs a 64-bit integer vector of [4 x i16], with each of the
1398	/// 16-bit integer vector elements set to the specified 16-bit integer
1399	/// value.
1400	///
1401	/// \headerfile <x86intrin.h>
1402	///
1403	/// This intrinsic is a utility function and does not correspond to a specific
1404	/// instruction.
1405	///
1406	/// \param __w
1407	/// A 16-bit integer value used to initialize each vector element of the
1408	/// result.
1409	/// \returns An initialized 64-bit integer vector of [4 x i16].
1410	static __inline__ __m64 __DEFAULT_FN_ATTRS
1411	_mm_set1_pi16(short __w)
1412	{
1413	return _mm_set_pi16(__w, __w, __w, __w);
1414	}
1415
1416	/// Constructs a 64-bit integer vector of [8 x i8], with each of the
1417	/// 8-bit integer vector elements set to the specified 8-bit integer value.
1418	///
1419	/// \headerfile <x86intrin.h>
1420	///
1421	/// This intrinsic is a utility function and does not correspond to a specific
1422	/// instruction.
1423	///
1424	/// \param __b
1425	/// An 8-bit integer value used to initialize each vector element of the
1426	/// result.
1427	/// \returns An initialized 64-bit integer vector of [8 x i8].
1428	static __inline__ __m64 __DEFAULT_FN_ATTRS
1429	_mm_set1_pi8(char __b)
1430	{
1431	return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1432	}
1433
1434	/// Constructs a 64-bit integer vector, initialized in reverse order with
1435	/// the specified 32-bit integer values.
1436	///
1437	/// \headerfile <x86intrin.h>
1438	///
1439	/// This intrinsic is a utility function and does not correspond to a specific
1440	/// instruction.
1441	///
1442	/// \param __i0
1443	/// A 32-bit integer value used to initialize the lower 32 bits of the
1444	/// result.
1445	/// \param __i1
1446	/// A 32-bit integer value used to initialize the upper 32 bits of the
1447	/// result.
1448	/// \returns An initialized 64-bit integer vector.
1449	static __inline__ __m64 __DEFAULT_FN_ATTRS
1450	_mm_setr_pi32(int __i0, int __i1)
1451	{
1452	return _mm_set_pi32(__i1, __i0);
1453	}
1454
1455	/// Constructs a 64-bit integer vector, initialized in reverse order with
1456	/// the specified 16-bit integer values.
1457	///
1458	/// \headerfile <x86intrin.h>
1459	///
1460	/// This intrinsic is a utility function and does not correspond to a specific
1461	/// instruction.
1462	///
1463	/// \param __w0
1464	/// A 16-bit integer value used to initialize bits [15:0] of the result.
1465	/// \param __w1
1466	/// A 16-bit integer value used to initialize bits [31:16] of the result.
1467	/// \param __w2
1468	/// A 16-bit integer value used to initialize bits [47:32] of the result.
1469	/// \param __w3
1470	/// A 16-bit integer value used to initialize bits [63:48] of the result.
1471	/// \returns An initialized 64-bit integer vector.
1472	static __inline__ __m64 __DEFAULT_FN_ATTRS
1473	_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1474	{
1475	return _mm_set_pi16(__w3, __w2, __w1, __w0);
1476	}
1477
1478	/// Constructs a 64-bit integer vector, initialized in reverse order with
1479	/// the specified 8-bit integer values.
1480	///
1481	/// \headerfile <x86intrin.h>
1482	///
1483	/// This intrinsic is a utility function and does not correspond to a specific
1484	/// instruction.
1485	///
1486	/// \param __b0
1487	/// An 8-bit integer value used to initialize bits [7:0] of the result.
1488	/// \param __b1
1489	/// An 8-bit integer value used to initialize bits [15:8] of the result.
1490	/// \param __b2
1491	/// An 8-bit integer value used to initialize bits [23:16] of the result.
1492	/// \param __b3
1493	/// An 8-bit integer value used to initialize bits [31:24] of the result.
1494	/// \param __b4
1495	/// An 8-bit integer value used to initialize bits [39:32] of the result.
1496	/// \param __b5
1497	/// An 8-bit integer value used to initialize bits [47:40] of the result.
1498	/// \param __b6
1499	/// An 8-bit integer value used to initialize bits [55:48] of the result.
1500	/// \param __b7
1501	/// An 8-bit integer value used to initialize bits [63:56] of the result.
1502	/// \returns An initialized 64-bit integer vector.
1503	static __inline__ __m64 __DEFAULT_FN_ATTRS
1504	_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1505	char __b6, char __b7)
1506	{
1507	return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1508	}
1509
/* The function-attribute helper is only meant for the definitions in this
 * header; remove it so it does not leak into code that includes the file. */
#undef __DEFAULT_FN_ATTRS

/* Aliases for compatibility.
 *
 * Each `_m_*` macro below is an object-like macro that expands to the
 * `_mm_*` name on its right, so either spelling refers to the same
 * intrinsic. */

/* _mm_empty and the _mm_cvt* conversions. */
#define _m_empty _mm_empty
#define _m_from_int _mm_cvtsi32_si64
#define _m_from_int64 _mm_cvtsi64_m64
#define _m_to_int _mm_cvtsi64_si32
#define _m_to_int64 _mm_cvtm64_si64

/* _mm_packs_* and _mm_unpack{hi,lo}_*. */
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32

/* _mm_add_* and _mm_adds_*. */
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16

/* _mm_sub_* and _mm_subs_*. */
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16

/* Multiplication: _mm_madd_*, _mm_mulhi_*, _mm_mullo_*. */
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmullw _mm_mullo_pi16

/* Left shifts: _mm_sll_* (vector count) and _mm_slli_* (integer count). */
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64

/* Sign-filling right shifts: _mm_sra_* and _mm_srai_*. */
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32

/* Zero-filling right shifts: _mm_srl_* and _mm_srli_*. */
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64

/* Bitwise logic on the whole 64-bit vector. */
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64

/* Element-wise comparisons: _mm_cmpeq_* and _mm_cmpgt_*. */
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtd _mm_cmpgt_pi32
1570
1571	#endif /* __MMINTRIN_H */
1572
1573

Clang Project