1 | // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s |
2 | |
3 | #include "Inputs/cuda.h" |
4 | |
5 | #define MAX_THREADS_PER_BLOCK 256 |
6 | #define MIN_BLOCKS_PER_MP 2 |
7 | |
8 | // Test both max threads per block and Min cta per sm. |
9 | extern "C" { |
10 | __global__ void |
11 | __launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP ) |
12 | Kernel1() |
13 | { |
14 | } |
15 | } |
16 | |
17 | // CHECK: !{{[0-9]+}} = !{void ()* @Kernel1, !"maxntidx", i32 256} |
18 | // CHECK: !{{[0-9]+}} = !{void ()* @Kernel1, !"minctasm", i32 2} |
19 | |
20 | // Test only max threads per block. Min cta per sm defaults to 0, and |
21 | // CodeGen doesn't output a zero value for minctasm. |
22 | extern "C" { |
23 | __global__ void |
24 | __launch_bounds__( MAX_THREADS_PER_BLOCK ) |
25 | Kernel2() |
26 | { |
27 | } |
28 | } |
29 | |
30 | // CHECK: !{{[0-9]+}} = !{void ()* @Kernel2, !"maxntidx", i32 256} |
31 | |
32 | template <int max_threads_per_block> |
33 | __global__ void |
34 | __launch_bounds__(max_threads_per_block) |
35 | Kernel3() |
36 | { |
37 | } |
38 | |
39 | template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>(); |
40 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256} |
41 | |
42 | template <int max_threads_per_block, int min_blocks_per_mp> |
43 | __global__ void |
44 | __launch_bounds__(max_threads_per_block, min_blocks_per_mp) |
45 | Kernel4() |
46 | { |
47 | } |
48 | template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>(); |
49 | |
50 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256} |
51 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2} |
52 | |
53 | const int constint = 100; |
54 | template <int max_threads_per_block, int min_blocks_per_mp> |
55 | __global__ void |
56 | __launch_bounds__(max_threads_per_block + constint, |
57 | min_blocks_per_mp + max_threads_per_block) |
58 | Kernel5() |
59 | { |
60 | } |
61 | template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>(); |
62 | |
63 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356} |
64 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258} |
65 | |
66 | // Make sure we don't emit negative launch bounds values. |
67 | __global__ void |
68 | __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP ) |
69 | Kernel6() |
70 | { |
71 | } |
72 | // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"maxntidx", |
73 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"minctasm", |
74 | |
75 | __global__ void |
76 | __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP ) |
77 | Kernel7() |
78 | { |
79 | } |
80 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"maxntidx", |
81 | // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"minctasm", |
82 | |
83 | const char constchar = 12; |
84 | __global__ void __launch_bounds__(constint, constchar) Kernel8() {} |
85 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100 |
86 | // CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12 |
87 | |