#ifndef __X86INTRIN_H
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H

#include <pmmintrin.h>

#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
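
/* AMD FMA4 (4-operand fused multiply-add) intrinsics. Each intrinsic below
 * evaluates an expression of the form +/-(__A * __B) +/- __C with a single
 * rounding step; the _ss/_sd forms operate on the lowest element only. The
 * intrinsics are only usable when the "fma4" target feature is enabled
 * (e.g. with -mfma4).
 *
 * A minimal usage sketch, illustrative only and not part of this header;
 * the helper name fma4_axpy is hypothetical:
 *
 *   #include <x86intrin.h>
 *
 *   // Returns a*x + y, four single-precision lanes at a time.
 *   static inline __m128 fma4_axpy(__m128 a, __m128 x, __m128 y) {
 *     return _mm_macc_ps(a, x, y);
 *   }
 *
 * The _mm_macc_* intrinsics that follow compute (__A * __B) + __C for packed
 * single, packed double, scalar single, and scalar double operands.
 */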
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
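
/* The _mm_msub_* intrinsics compute (__A * __B) - __C. */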
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
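
/* The _mm_nmacc_* intrinsics compute -(__A * __B) + __C. */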
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
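
/* The _mm_nmsub_* intrinsics compute -(__A * __B) - __C. */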
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
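
/* _mm_maddsub_ps/_pd alternate per lane: even-indexed elements get
 * (__A * __B) - __C and odd-indexed elements get (__A * __B) + __C. */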
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
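
/* _mm_msubadd_ps/_pd alternate the other way: even-indexed elements get
 * (__A * __B) + __C and odd-indexed elements get (__A * __B) - __C. */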
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
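
/* 256-bit variants of the same operations, operating on eight floats or
 * four doubles per vector. _mm256_macc_* computes (__A * __B) + __C. */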
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
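
/* _mm256_msub_* computes (__A * __B) - __C. */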
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
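
/* _mm256_nmacc_* computes -(__A * __B) + __C. */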
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
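
/* _mm256_nmsub_* computes -(__A * __B) - __C. */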
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
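
/* _mm256_maddsub_* subtracts __C in even-indexed lanes and adds it in
 * odd-indexed lanes. */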
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
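
/* _mm256_msubadd_* adds __C in even-indexed lanes and subtracts it in
 * odd-indexed lanes. */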
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif