#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
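
// sm_30 intrinsics: __shfl_{up,down,xor}.

// Prevent CUDA's own sm_30 intrinsics header from being included and
// redefining these wrappers.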
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    /* Split the 64-bit value into two ints, shuffle each, then reassemble. */ \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    /* Cast via long long to avoid recursing into this same overload. */ \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
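// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.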
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)

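// __shfl_sync_* variants of the shuffles, as introduced by CUDA-9.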
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    /* Split the 64-bit value into two ints, shuffle each, then reassemble. */ \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    /* Cast via long long to avoid recursing into this same overload. */ \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
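// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.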
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

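// Define the __match* builtins that the CUDA-9 headers expect to see.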
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000
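
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent CUDA's own sm_32 intrinsics header from being included and
// redefining these wrappers.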
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
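  // The vector load below assumes ptr is aligned to c2 (the two-element char
  // vector); this is safe as long as alignof(c2) <= alignof(char2).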
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
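// Funnel shifts: shift the 64-bit value (high32:low32) by shiftWidth bits and
// return one 32-bit half, implemented with PTX shf.{l,r}.{wrap,clamp}.b32.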
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // __CLANG_CUDA_INTRINSICS_H__