/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
23 | |
24 | #ifndef __PMMINTRIN_H |
25 | #define __PMMINTRIN_H |
26 | |
27 | #include <emmintrin.h> |
28 | |
/* Attribute set applied to the functions in this file: always inline, no debug
 * info, the "sse3" target feature, and a minimum vector width of 128. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3"),           \
                 __min_vector_width__(128)))
32 | |
33 | /// Loads data from an unaligned memory location to elements in a 128-bit |
34 | /// vector. |
35 | /// |
36 | /// If the address of the data is not 16-byte aligned, the instruction may |
37 | /// read two adjacent aligned blocks of memory to retrieve the requested |
38 | /// data. |
39 | /// |
40 | /// \headerfile <x86intrin.h> |
41 | /// |
42 | /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. |
43 | /// |
44 | /// \param __p |
45 | /// A pointer to a 128-bit integer vector containing integer values. |
46 | /// \returns A 128-bit vector containing the moved values. |
47 | static __inline__ __m128i __DEFAULT_FN_ATTRS |
48 | _mm_lddqu_si128(__m128i const *__p) |
49 | { |
50 | return (__m128i)__builtin_ia32_lddqu((char const *)__p); |
51 | } |
52 | |
53 | /// Adds the even-indexed values and subtracts the odd-indexed values of |
54 | /// two 128-bit vectors of [4 x float]. |
55 | /// |
56 | /// \headerfile <x86intrin.h> |
57 | /// |
58 | /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. |
59 | /// |
60 | /// \param __a |
61 | /// A 128-bit vector of [4 x float] containing the left source operand. |
62 | /// \param __b |
63 | /// A 128-bit vector of [4 x float] containing the right source operand. |
64 | /// \returns A 128-bit vector of [4 x float] containing the alternating sums and |
65 | /// differences of both operands. |
66 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
67 | _mm_addsub_ps(__m128 __a, __m128 __b) |
68 | { |
69 | return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); |
70 | } |
71 | |
72 | /// Horizontally adds the adjacent pairs of values contained in two |
73 | /// 128-bit vectors of [4 x float]. |
74 | /// |
75 | /// \headerfile <x86intrin.h> |
76 | /// |
77 | /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. |
78 | /// |
79 | /// \param __a |
80 | /// A 128-bit vector of [4 x float] containing one of the source operands. |
81 | /// The horizontal sums of the values are stored in the lower bits of the |
82 | /// destination. |
83 | /// \param __b |
84 | /// A 128-bit vector of [4 x float] containing one of the source operands. |
85 | /// The horizontal sums of the values are stored in the upper bits of the |
86 | /// destination. |
87 | /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of |
88 | /// both operands. |
89 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
90 | _mm_hadd_ps(__m128 __a, __m128 __b) |
91 | { |
92 | return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); |
93 | } |
94 | |
95 | /// Horizontally subtracts the adjacent pairs of values contained in two |
96 | /// 128-bit vectors of [4 x float]. |
97 | /// |
98 | /// \headerfile <x86intrin.h> |
99 | /// |
100 | /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. |
101 | /// |
102 | /// \param __a |
103 | /// A 128-bit vector of [4 x float] containing one of the source operands. |
104 | /// The horizontal differences between the values are stored in the lower |
105 | /// bits of the destination. |
106 | /// \param __b |
107 | /// A 128-bit vector of [4 x float] containing one of the source operands. |
108 | /// The horizontal differences between the values are stored in the upper |
109 | /// bits of the destination. |
110 | /// \returns A 128-bit vector of [4 x float] containing the horizontal |
111 | /// differences of both operands. |
112 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
113 | _mm_hsub_ps(__m128 __a, __m128 __b) |
114 | { |
115 | return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); |
116 | } |
117 | |
118 | /// Moves and duplicates odd-indexed values from a 128-bit vector |
119 | /// of [4 x float] to float values stored in a 128-bit vector of |
120 | /// [4 x float]. |
121 | /// |
122 | /// \headerfile <x86intrin.h> |
123 | /// |
124 | /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. |
125 | /// |
126 | /// \param __a |
127 | /// A 128-bit vector of [4 x float]. \n |
128 | /// Bits [127:96] of the source are written to bits [127:96] and [95:64] of |
129 | /// the destination. \n |
130 | /// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the |
131 | /// destination. |
132 | /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated |
133 | /// values. |
134 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
135 | _mm_movehdup_ps(__m128 __a) |
136 | { |
137 | return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); |
138 | } |
139 | |
140 | /// Duplicates even-indexed values from a 128-bit vector of |
141 | /// [4 x float] to float values stored in a 128-bit vector of [4 x float]. |
142 | /// |
143 | /// \headerfile <x86intrin.h> |
144 | /// |
145 | /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. |
146 | /// |
147 | /// \param __a |
148 | /// A 128-bit vector of [4 x float] \n |
149 | /// Bits [95:64] of the source are written to bits [127:96] and [95:64] of |
150 | /// the destination. \n |
151 | /// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the |
152 | /// destination. |
153 | /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated |
154 | /// values. |
155 | static __inline__ __m128 __DEFAULT_FN_ATTRS |
156 | _mm_moveldup_ps(__m128 __a) |
157 | { |
158 | return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); |
159 | } |
160 | |
161 | /// Adds the even-indexed values and subtracts the odd-indexed values of |
162 | /// two 128-bit vectors of [2 x double]. |
163 | /// |
164 | /// \headerfile <x86intrin.h> |
165 | /// |
166 | /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. |
167 | /// |
168 | /// \param __a |
169 | /// A 128-bit vector of [2 x double] containing the left source operand. |
170 | /// \param __b |
171 | /// A 128-bit vector of [2 x double] containing the right source operand. |
172 | /// \returns A 128-bit vector of [2 x double] containing the alternating sums |
173 | /// and differences of both operands. |
174 | static __inline__ __m128d __DEFAULT_FN_ATTRS |
175 | _mm_addsub_pd(__m128d __a, __m128d __b) |
176 | { |
177 | return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); |
178 | } |
179 | |
180 | /// Horizontally adds the pairs of values contained in two 128-bit |
181 | /// vectors of [2 x double]. |
182 | /// |
183 | /// \headerfile <x86intrin.h> |
184 | /// |
185 | /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. |
186 | /// |
187 | /// \param __a |
188 | /// A 128-bit vector of [2 x double] containing one of the source operands. |
189 | /// The horizontal sum of the values is stored in the lower bits of the |
190 | /// destination. |
191 | /// \param __b |
192 | /// A 128-bit vector of [2 x double] containing one of the source operands. |
193 | /// The horizontal sum of the values is stored in the upper bits of the |
194 | /// destination. |
195 | /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of |
196 | /// both operands. |
197 | static __inline__ __m128d __DEFAULT_FN_ATTRS |
198 | _mm_hadd_pd(__m128d __a, __m128d __b) |
199 | { |
200 | return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); |
201 | } |
202 | |
203 | /// Horizontally subtracts the pairs of values contained in two 128-bit |
204 | /// vectors of [2 x double]. |
205 | /// |
206 | /// \headerfile <x86intrin.h> |
207 | /// |
208 | /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. |
209 | /// |
210 | /// \param __a |
211 | /// A 128-bit vector of [2 x double] containing one of the source operands. |
212 | /// The horizontal difference of the values is stored in the lower bits of |
213 | /// the destination. |
214 | /// \param __b |
215 | /// A 128-bit vector of [2 x double] containing one of the source operands. |
216 | /// The horizontal difference of the values is stored in the upper bits of |
217 | /// the destination. |
218 | /// \returns A 128-bit vector of [2 x double] containing the horizontal |
219 | /// differences of both operands. |
220 | static __inline__ __m128d __DEFAULT_FN_ATTRS |
221 | _mm_hsub_pd(__m128d __a, __m128d __b) |
222 | { |
223 | return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); |
224 | } |
225 | |
/// Loads one double-precision value from memory and duplicates it into both
///    elements of a 128-bit vector of [2 x double].
///
///    This macro is an alias that expands to a call to _mm_load1_pd().
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to the double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
242 | |
243 | /// Moves and duplicates the double-precision value in the lower bits of |
244 | /// a 128-bit vector of [2 x double] to double-precision values stored in a |
245 | /// 128-bit vector of [2 x double]. |
246 | /// |
247 | /// \headerfile <x86intrin.h> |
248 | /// |
249 | /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. |
250 | /// |
251 | /// \param __a |
252 | /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits |
253 | /// [127:64] and [63:0] of the destination. |
254 | /// \returns A 128-bit vector of [2 x double] containing the moved and |
255 | /// duplicated values. |
256 | static __inline__ __m128d __DEFAULT_FN_ATTRS |
257 | _mm_movedup_pd(__m128d __a) |
258 | { |
259 | return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); |
260 | } |
261 | |
262 | /// Establishes a linear address memory range to be monitored and puts |
263 | /// the processor in the monitor event pending state. Data stored in the |
264 | /// monitored address range causes the processor to exit the pending state. |
265 | /// |
266 | /// \headerfile <x86intrin.h> |
267 | /// |
268 | /// This intrinsic corresponds to the <c> MONITOR </c> instruction. |
269 | /// |
270 | /// \param __p |
271 | /// The memory range to be monitored. The size of the range is determined by |
272 | /// CPUID function 0000_0005h. |
273 | /// \param __extensions |
274 | /// Optional extensions for the monitoring state. |
275 | /// \param __hints |
276 | /// Optional hints for the monitoring state. |
277 | static __inline__ void __DEFAULT_FN_ATTRS |
278 | _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) |
279 | { |
280 | __builtin_ia32_monitor((void *)__p, __extensions, __hints); |
281 | } |
282 | |
283 | /// Used with the MONITOR instruction to wait while the processor is in |
284 | /// the monitor event pending state. Data stored in the monitored address |
285 | /// range causes the processor to exit the pending state. |
286 | /// |
287 | /// \headerfile <x86intrin.h> |
288 | /// |
289 | /// This intrinsic corresponds to the <c> MWAIT </c> instruction. |
290 | /// |
291 | /// \param __extensions |
292 | /// Optional extensions for the monitoring state, which may vary by |
293 | /// processor. |
294 | /// \param __hints |
295 | /// Optional hints for the monitoring state, which may vary by processor. |
296 | static __inline__ void __DEFAULT_FN_ATTRS |
297 | _mm_mwait(unsigned __extensions, unsigned __hints) |
298 | { |
299 | __builtin_ia32_mwait(__extensions, __hints); |
300 | } |
301 | |
302 | #undef __DEFAULT_FN_ATTRS |
303 | |
304 | #endif /* __PMMINTRIN_H */ |
305 |