| 1 | |
| 2 | |
| 3 | |
| 4 | |
| 5 | |
| 6 | |
| 7 | |
| 8 | |
| 9 | |
| 10 | |
| 11 | |
| 12 | |
| 13 | |
| 14 | |
| 15 | |
| 16 | |
| 17 | |
| 18 | |
| 19 | |
| 20 | |
| 21 | |
| 22 | |
| 23 | #ifndef __CLANG_CUDA_INTRINSICS_H__ |
| 24 | #define __CLANG_CUDA_INTRINSICS_H__ |
| 25 | #ifndef __CUDA__ |
| 26 | #error "This file is for CUDA compilation only." |
| 27 | #endif |
| 28 | |
| 29 | |
| 30 | |
| 31 | #define __SM_30_INTRINSICS_H__ |
| 32 | #define __SM_30_INTRINSICS_HPP__ |
| 33 | |
| 34 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 35 | |
| 36 | #pragma push_macro("__MAKE_SHUFFLES") |
| 37 | #define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \ |
| 38 | __Type) \ |
| 39 | inline __device__ int __FnName(int __val, __Type __offset, \ |
| 40 | int __width = warpSize) { \ |
| 41 | return __IntIntrinsic(__val, __offset, \ |
| 42 | ((warpSize - __width) << 8) | (__Mask)); \ |
| 43 | } \ |
| 44 | inline __device__ float __FnName(float __val, __Type __offset, \ |
| 45 | int __width = warpSize) { \ |
| 46 | return __FloatIntrinsic(__val, __offset, \ |
| 47 | ((warpSize - __width) << 8) | (__Mask)); \ |
| 48 | } \ |
| 49 | inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \ |
| 50 | int __width = warpSize) { \ |
| 51 | return static_cast<unsigned int>( \ |
| 52 | ::__FnName(static_cast<int>(__val), __offset, __width)); \ |
| 53 | } \ |
| 54 | inline __device__ long long __FnName(long long __val, __Type __offset, \ |
| 55 | int __width = warpSize) { \ |
| 56 | struct __Bits { \ |
| 57 | int __a, __b; \ |
| 58 | }; \ |
| 59 | _Static_assert(sizeof(__val) == sizeof(__Bits)); \ |
| 60 | _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ |
| 61 | __Bits __tmp; \ |
| 62 | memcpy(&__val, &__tmp, sizeof(__val)); \ |
| 63 | __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \ |
| 64 | __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \ |
| 65 | long long __ret; \ |
| 66 | memcpy(&__ret, &__tmp, sizeof(__tmp)); \ |
| 67 | return __ret; \ |
| 68 | } \ |
| 69 | inline __device__ long __FnName(long __val, __Type __offset, \ |
| 70 | int __width = warpSize) { \ |
| 71 | _Static_assert(sizeof(long) == sizeof(long long) || \ |
| 72 | sizeof(long) == sizeof(int)); \ |
| 73 | if (sizeof(long) == sizeof(long long)) { \ |
| 74 | return static_cast<long>( \ |
| 75 | ::__FnName(static_cast<long long>(__val), __offset, __width)); \ |
| 76 | } else if (sizeof(long) == sizeof(int)) { \ |
| 77 | return static_cast<long>( \ |
| 78 | ::__FnName(static_cast<int>(__val), __offset, __width)); \ |
| 79 | } \ |
| 80 | } \ |
| 81 | inline __device__ unsigned long __FnName( \ |
| 82 | unsigned long __val, __Type __offset, int __width = warpSize) { \ |
| 83 | return static_cast<unsigned long>( \ |
| 84 | ::__FnName(static_cast<long>(__val), __offset, __width)); \ |
| 85 | } \ |
| 86 | inline __device__ unsigned long long __FnName( \ |
| 87 | unsigned long long __val, __Type __offset, int __width = warpSize) { \ |
| 88 | return static_cast<unsigned long long>(::__FnName( \ |
| 89 | static_cast<unsigned long long>(__val), __offset, __width)); \ |
| 90 | } \ |
| 91 | inline __device__ double __FnName(double __val, __Type __offset, \ |
| 92 | int __width = warpSize) { \ |
| 93 | long long __tmp; \ |
| 94 | _Static_assert(sizeof(__tmp) == sizeof(__val)); \ |
| 95 | memcpy(&__tmp, &__val, sizeof(__val)); \ |
| 96 | __tmp = ::__FnName(__tmp, __offset, __width); \ |
| 97 | double __ret; \ |
| 98 | memcpy(&__ret, &__tmp, sizeof(__ret)); \ |
| 99 | return __ret; \ |
| 100 | } |
| 101 | |
| 102 | __MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int); |
| 103 | |
| 104 | |
| 105 | __MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0, |
| 106 | unsigned int); |
| 107 | __MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f, |
| 108 | unsigned int); |
| 109 | __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f, |
| 110 | int); |
| 111 | #pragma pop_macro("__MAKE_SHUFFLES") |
| 112 | |
| 113 | #endif |
| 114 | |
| 115 | #if CUDA_VERSION >= 9000 |
| 116 | #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) |
| 117 | |
| 118 | #pragma push_macro("__MAKE_SYNC_SHUFFLES") |
| 119 | #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ |
| 120 | __Mask, __Type) \ |
| 121 | inline __device__ int __FnName(unsigned int __mask, int __val, \ |
| 122 | __Type __offset, int __width = warpSize) { \ |
| 123 | return __IntIntrinsic(__mask, __val, __offset, \ |
| 124 | ((warpSize - __width) << 8) | (__Mask)); \ |
| 125 | } \ |
| 126 | inline __device__ float __FnName(unsigned int __mask, float __val, \ |
| 127 | __Type __offset, int __width = warpSize) { \ |
| 128 | return __FloatIntrinsic(__mask, __val, __offset, \ |
| 129 | ((warpSize - __width) << 8) | (__Mask)); \ |
| 130 | } \ |
| 131 | inline __device__ unsigned int __FnName(unsigned int __mask, \ |
| 132 | unsigned int __val, __Type __offset, \ |
| 133 | int __width = warpSize) { \ |
| 134 | return static_cast<unsigned int>( \ |
| 135 | ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ |
| 136 | } \ |
| 137 | inline __device__ long long __FnName(unsigned int __mask, long long __val, \ |
| 138 | __Type __offset, \ |
| 139 | int __width = warpSize) { \ |
| 140 | struct __Bits { \ |
| 141 | int __a, __b; \ |
| 142 | }; \ |
| 143 | _Static_assert(sizeof(__val) == sizeof(__Bits)); \ |
| 144 | _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ |
| 145 | __Bits __tmp; \ |
| 146 | memcpy(&__val, &__tmp, sizeof(__val)); \ |
| 147 | __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \ |
| 148 | __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \ |
| 149 | long long __ret; \ |
| 150 | memcpy(&__ret, &__tmp, sizeof(__tmp)); \ |
| 151 | return __ret; \ |
| 152 | } \ |
| 153 | inline __device__ unsigned long long __FnName( \ |
| 154 | unsigned int __mask, unsigned long long __val, __Type __offset, \ |
| 155 | int __width = warpSize) { \ |
| 156 | return static_cast<unsigned long long>(::__FnName( \ |
| 157 | __mask, static_cast<unsigned long long>(__val), __offset, __width)); \ |
| 158 | } \ |
| 159 | inline __device__ long __FnName(unsigned int __mask, long __val, \ |
| 160 | __Type __offset, int __width = warpSize) { \ |
| 161 | _Static_assert(sizeof(long) == sizeof(long long) || \ |
| 162 | sizeof(long) == sizeof(int)); \ |
| 163 | if (sizeof(long) == sizeof(long long)) { \ |
| 164 | return static_cast<long>(::__FnName( \ |
| 165 | __mask, static_cast<long long>(__val), __offset, __width)); \ |
| 166 | } else if (sizeof(long) == sizeof(int)) { \ |
| 167 | return static_cast<long>( \ |
| 168 | ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ |
| 169 | } \ |
| 170 | } \ |
| 171 | inline __device__ unsigned long __FnName( \ |
| 172 | unsigned int __mask, unsigned long __val, __Type __offset, \ |
| 173 | int __width = warpSize) { \ |
| 174 | return static_cast<unsigned long>( \ |
| 175 | ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \ |
| 176 | } \ |
| 177 | inline __device__ double __FnName(unsigned int __mask, double __val, \ |
| 178 | __Type __offset, int __width = warpSize) { \ |
| 179 | long long __tmp; \ |
| 180 | _Static_assert(sizeof(__tmp) == sizeof(__val)); \ |
| 181 | memcpy(&__tmp, &__val, sizeof(__val)); \ |
| 182 | __tmp = ::__FnName(__mask, __tmp, __offset, __width); \ |
| 183 | double __ret; \ |
| 184 | memcpy(&__ret, &__tmp, sizeof(__ret)); \ |
| 185 | return __ret; \ |
| 186 | } |
| 187 | __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32, |
| 188 | __nvvm_shfl_sync_idx_f32, 0x1f, int); |
| 189 | |
| 190 | |
| 191 | __MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, |
| 192 | __nvvm_shfl_sync_up_f32, 0, unsigned int); |
| 193 | __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, |
| 194 | __nvvm_shfl_sync_down_f32, 0x1f, unsigned int); |
| 195 | __MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, |
| 196 | __nvvm_shfl_sync_bfly_f32, 0x1f, int); |
| 197 | #pragma pop_macro("__MAKE_SYNC_SHUFFLES") |
| 198 | |
| 199 | inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { |
| 200 | return __nvvm_bar_warp_sync(mask); |
| 201 | } |
| 202 | |
| 203 | inline __device__ void __barrier_sync(unsigned int id) { |
| 204 | __nvvm_barrier_sync(id); |
| 205 | } |
| 206 | |
| 207 | inline __device__ void __barrier_sync_count(unsigned int id, |
| 208 | unsigned int count) { |
| 209 | __nvvm_barrier_sync_cnt(id, count); |
| 210 | } |
| 211 | |
| 212 | inline __device__ int __all_sync(unsigned int mask, int pred) { |
| 213 | return __nvvm_vote_all_sync(mask, pred); |
| 214 | } |
| 215 | |
| 216 | inline __device__ int __any_sync(unsigned int mask, int pred) { |
| 217 | return __nvvm_vote_any_sync(mask, pred); |
| 218 | } |
| 219 | |
| 220 | inline __device__ int __uni_sync(unsigned int mask, int pred) { |
| 221 | return __nvvm_vote_uni_sync(mask, pred); |
| 222 | } |
| 223 | |
| 224 | inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { |
| 225 | return __nvvm_vote_ballot_sync(mask, pred); |
| 226 | } |
| 227 | |
| 228 | inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } |
| 229 | |
| 230 | inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { |
| 231 | return __nvvm_fns(mask, base, offset); |
| 232 | } |
| 233 | |
| 234 | #endif |
| 235 | |
| 236 | |
| 237 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 |
| 238 | inline __device__ unsigned int __match32_any_sync(unsigned int mask, |
| 239 | unsigned int value) { |
| 240 | return __nvvm_match_any_sync_i32(mask, value); |
| 241 | } |
| 242 | |
| 243 | inline __device__ unsigned long long |
| 244 | __match64_any_sync(unsigned int mask, unsigned long long value) { |
| 245 | return __nvvm_match_any_sync_i64(mask, value); |
| 246 | } |
| 247 | |
| 248 | inline __device__ unsigned int |
| 249 | __match32_all_sync(unsigned int mask, unsigned int value, int *pred) { |
| 250 | return __nvvm_match_all_sync_i32p(mask, value, pred); |
| 251 | } |
| 252 | |
| 253 | inline __device__ unsigned long long |
| 254 | __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { |
| 255 | return __nvvm_match_all_sync_i64p(mask, value, pred); |
| 256 | } |
| 257 | #include "crt/sm_70_rt.hpp" |
| 258 | |
| 259 | #endif |
| 260 | #endif |
| 261 | |
| 262 | |
| 263 | |
| 264 | |
| 265 | #define __SM_32_INTRINSICS_H__ |
| 266 | #define __SM_32_INTRINSICS_HPP__ |
| 267 | |
| 268 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
| 269 | |
| 270 | inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); } |
| 271 | inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); } |
| 272 | inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); } |
| 273 | inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); } |
| 274 | inline __device__ long long __ldg(const long long *ptr) { |
| 275 | return __nvvm_ldg_ll(ptr); |
| 276 | } |
| 277 | inline __device__ unsigned char __ldg(const unsigned char *ptr) { |
| 278 | return __nvvm_ldg_uc(ptr); |
| 279 | } |
| 280 | inline __device__ signed char __ldg(const signed char *ptr) { |
| 281 | return __nvvm_ldg_uc((const unsigned char *)ptr); |
| 282 | } |
| 283 | inline __device__ unsigned short __ldg(const unsigned short *ptr) { |
| 284 | return __nvvm_ldg_us(ptr); |
| 285 | } |
| 286 | inline __device__ unsigned int __ldg(const unsigned int *ptr) { |
| 287 | return __nvvm_ldg_ui(ptr); |
| 288 | } |
| 289 | inline __device__ unsigned long __ldg(const unsigned long *ptr) { |
| 290 | return __nvvm_ldg_ul(ptr); |
| 291 | } |
| 292 | inline __device__ unsigned long long __ldg(const unsigned long long *ptr) { |
| 293 | return __nvvm_ldg_ull(ptr); |
| 294 | } |
| 295 | inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); } |
| 296 | inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); } |
| 297 | |
| 298 | inline __device__ char2 __ldg(const char2 *ptr) { |
| 299 | typedef char c2 __attribute__((ext_vector_type(2))); |
| 300 | |
| 301 | |
| 302 | |
| 303 | c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr)); |
| 304 | char2 ret; |
| 305 | ret.x = rv[0]; |
| 306 | ret.y = rv[1]; |
| 307 | return ret; |
| 308 | } |
| 309 | inline __device__ char4 __ldg(const char4 *ptr) { |
| 310 | typedef char c4 __attribute__((ext_vector_type(4))); |
| 311 | c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr)); |
| 312 | char4 ret; |
| 313 | ret.x = rv[0]; |
| 314 | ret.y = rv[1]; |
| 315 | ret.z = rv[2]; |
| 316 | ret.w = rv[3]; |
| 317 | return ret; |
| 318 | } |
| 319 | inline __device__ short2 __ldg(const short2 *ptr) { |
| 320 | typedef short s2 __attribute__((ext_vector_type(2))); |
| 321 | s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr)); |
| 322 | short2 ret; |
| 323 | ret.x = rv[0]; |
| 324 | ret.y = rv[1]; |
| 325 | return ret; |
| 326 | } |
| 327 | inline __device__ short4 __ldg(const short4 *ptr) { |
| 328 | typedef short s4 __attribute__((ext_vector_type(4))); |
| 329 | s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr)); |
| 330 | short4 ret; |
| 331 | ret.x = rv[0]; |
| 332 | ret.y = rv[1]; |
| 333 | ret.z = rv[2]; |
| 334 | ret.w = rv[3]; |
| 335 | return ret; |
| 336 | } |
| 337 | inline __device__ int2 __ldg(const int2 *ptr) { |
| 338 | typedef int i2 __attribute__((ext_vector_type(2))); |
| 339 | i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr)); |
| 340 | int2 ret; |
| 341 | ret.x = rv[0]; |
| 342 | ret.y = rv[1]; |
| 343 | return ret; |
| 344 | } |
| 345 | inline __device__ int4 __ldg(const int4 *ptr) { |
| 346 | typedef int i4 __attribute__((ext_vector_type(4))); |
| 347 | i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr)); |
| 348 | int4 ret; |
| 349 | ret.x = rv[0]; |
| 350 | ret.y = rv[1]; |
| 351 | ret.z = rv[2]; |
| 352 | ret.w = rv[3]; |
| 353 | return ret; |
| 354 | } |
| 355 | inline __device__ longlong2 __ldg(const longlong2 *ptr) { |
| 356 | typedef long long ll2 __attribute__((ext_vector_type(2))); |
| 357 | ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr)); |
| 358 | longlong2 ret; |
| 359 | ret.x = rv[0]; |
| 360 | ret.y = rv[1]; |
| 361 | return ret; |
| 362 | } |
| 363 | |
| 364 | inline __device__ uchar2 __ldg(const uchar2 *ptr) { |
| 365 | typedef unsigned char uc2 __attribute__((ext_vector_type(2))); |
| 366 | uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr)); |
| 367 | uchar2 ret; |
| 368 | ret.x = rv[0]; |
| 369 | ret.y = rv[1]; |
| 370 | return ret; |
| 371 | } |
| 372 | inline __device__ uchar4 __ldg(const uchar4 *ptr) { |
| 373 | typedef unsigned char uc4 __attribute__((ext_vector_type(4))); |
| 374 | uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr)); |
| 375 | uchar4 ret; |
| 376 | ret.x = rv[0]; |
| 377 | ret.y = rv[1]; |
| 378 | ret.z = rv[2]; |
| 379 | ret.w = rv[3]; |
| 380 | return ret; |
| 381 | } |
| 382 | inline __device__ ushort2 __ldg(const ushort2 *ptr) { |
| 383 | typedef unsigned short us2 __attribute__((ext_vector_type(2))); |
| 384 | us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr)); |
| 385 | ushort2 ret; |
| 386 | ret.x = rv[0]; |
| 387 | ret.y = rv[1]; |
| 388 | return ret; |
| 389 | } |
| 390 | inline __device__ ushort4 __ldg(const ushort4 *ptr) { |
| 391 | typedef unsigned short us4 __attribute__((ext_vector_type(4))); |
| 392 | us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr)); |
| 393 | ushort4 ret; |
| 394 | ret.x = rv[0]; |
| 395 | ret.y = rv[1]; |
| 396 | ret.z = rv[2]; |
| 397 | ret.w = rv[3]; |
| 398 | return ret; |
| 399 | } |
| 400 | inline __device__ uint2 __ldg(const uint2 *ptr) { |
| 401 | typedef unsigned int ui2 __attribute__((ext_vector_type(2))); |
| 402 | ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr)); |
| 403 | uint2 ret; |
| 404 | ret.x = rv[0]; |
| 405 | ret.y = rv[1]; |
| 406 | return ret; |
| 407 | } |
| 408 | inline __device__ uint4 __ldg(const uint4 *ptr) { |
| 409 | typedef unsigned int ui4 __attribute__((ext_vector_type(4))); |
| 410 | ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr)); |
| 411 | uint4 ret; |
| 412 | ret.x = rv[0]; |
| 413 | ret.y = rv[1]; |
| 414 | ret.z = rv[2]; |
| 415 | ret.w = rv[3]; |
| 416 | return ret; |
| 417 | } |
| 418 | inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) { |
| 419 | typedef unsigned long long ull2 __attribute__((ext_vector_type(2))); |
| 420 | ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr)); |
| 421 | ulonglong2 ret; |
| 422 | ret.x = rv[0]; |
| 423 | ret.y = rv[1]; |
| 424 | return ret; |
| 425 | } |
| 426 | |
| 427 | inline __device__ float2 __ldg(const float2 *ptr) { |
| 428 | typedef float f2 __attribute__((ext_vector_type(2))); |
| 429 | f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr)); |
| 430 | float2 ret; |
| 431 | ret.x = rv[0]; |
| 432 | ret.y = rv[1]; |
| 433 | return ret; |
| 434 | } |
| 435 | inline __device__ float4 __ldg(const float4 *ptr) { |
| 436 | typedef float f4 __attribute__((ext_vector_type(4))); |
| 437 | f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr)); |
| 438 | float4 ret; |
| 439 | ret.x = rv[0]; |
| 440 | ret.y = rv[1]; |
| 441 | ret.z = rv[2]; |
| 442 | ret.w = rv[3]; |
| 443 | return ret; |
| 444 | } |
| 445 | inline __device__ double2 __ldg(const double2 *ptr) { |
| 446 | typedef double d2 __attribute__((ext_vector_type(2))); |
| 447 | d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr)); |
| 448 | double2 ret; |
| 449 | ret.x = rv[0]; |
| 450 | ret.y = rv[1]; |
| 451 | return ret; |
| 452 | } |
| 453 | |
| 454 | |
| 455 | |
| 456 | |
| 457 | inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32, |
| 458 | unsigned shiftWidth) { |
| 459 | unsigned result; |
| 460 | asm("shf.l.wrap.b32 %0, %1, %2, %3;" |
| 461 | : "=r"(result) |
| 462 | : "r"(low32), "r"(high32), "r"(shiftWidth)); |
| 463 | return result; |
| 464 | } |
| 465 | inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32, |
| 466 | unsigned shiftWidth) { |
| 467 | unsigned result; |
| 468 | asm("shf.l.clamp.b32 %0, %1, %2, %3;" |
| 469 | : "=r"(result) |
| 470 | : "r"(low32), "r"(high32), "r"(shiftWidth)); |
| 471 | return result; |
| 472 | } |
| 473 | inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32, |
| 474 | unsigned shiftWidth) { |
| 475 | unsigned result; |
| 476 | asm("shf.r.wrap.b32 %0, %1, %2, %3;" |
| 477 | : "=r"(result) |
| 478 | : "r"(low32), "r"(high32), "r"(shiftWidth)); |
| 479 | return result; |
| 480 | } |
| 481 | inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, |
| 482 | unsigned shiftWidth) { |
| 483 | unsigned ret; |
| 484 | asm("shf.r.clamp.b32 %0, %1, %2, %3;" |
| 485 | : "=r"(ret) |
| 486 | : "r"(low32), "r"(high32), "r"(shiftWidth)); |
| 487 | return ret; |
| 488 | } |
| 489 | |
| 490 | #endif |
| 491 | |
| 492 | #endif |
| 493 | |