1 | // RUN: echo "GPU binary would be here" > %t |
2 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
3 | // RUN: -target-sdk-version=8.0 -fcuda-include-gpubinary %t -o - \ |
4 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s \ |
5 | // RUN: --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-OLD |
6 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
7 | // RUN: -target-sdk-version=8.0 -fcuda-include-gpubinary %t \ |
8 | // RUN: -o - -DNOGLOBALS \ |
9 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s \ |
10 | // RUN: -check-prefixes=NOGLOBALS,CUDANOGLOBALS |
11 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
12 | // RUN: -target-sdk-version=8.0 -fgpu-rdc -fcuda-include-gpubinary %t \ |
13 | // RUN: -o - \ |
14 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s \ |
15 | // RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-OLD |
16 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
17 | // RUN: -target-sdk-version=8.0 -o - \ |
18 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN |
19 | |
20 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
21 | // RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \ |
22 | // RUN: | FileCheck %s -allow-deprecated-dag-overlap \ |
23 | // RUN: --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW |
24 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
25 | // RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ |
26 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s \ |
27 | // RUN: --check-prefixes=NOGLOBALS,CUDANOGLOBALS |
28 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
29 | // RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \ |
30 | // RUN: | FileCheck %s -allow-deprecated-dag-overlap \ |
31 | // RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW |
32 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
33 | // RUN: -target-sdk-version=9.2 -o - \ |
34 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN |
35 | |
36 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
37 | // RUN: -fcuda-include-gpubinary %t -o - -x hip\ |
38 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF |
39 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
40 | // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \ |
41 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS |
42 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ |
43 | // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ |
44 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF |
45 | // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ |
46 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF |
47 | |
48 | // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ |
49 | // RUN: -fcuda-include-gpubinary %t -o - -x hip\ |
50 | // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN |
51 | |
52 | #include "Inputs/cuda.h" |
53 | |
54 | #ifndef NOGLOBALS |
55 | // LNX-DAG: @device_var = internal global i32 |
56 | // WIN-DAG: @"?device_var@@3HA" = internal global i32 |
57 | __device__ int device_var; |
58 | |
59 | // LNX-DAG: @constant_var = internal global i32 |
60 | // WIN-DAG: @"?constant_var@@3HA" = internal global i32 |
61 | __constant__ int constant_var; |
62 | |
63 | // LNX-DAG: @shared_var = internal global i32 |
64 | // WIN-DAG: @"?shared_var@@3HA" = internal global i32 |
65 | __shared__ int shared_var; |
66 | |
67 | // Make sure host globals don't get internalized... |
68 | // LNX-DAG: @host_var = global i32 |
69 | // WIN-DAG: @"?host_var@@3HA" = dso_local global i32 |
70 | int host_var; |
71 | // ... and that extern vars remain external. |
72 | // LNX-DAG: @ext_host_var = external global i32 |
73 | // WIN-DAG: @"?ext_host_var@@3HA" = external dso_local global i32 |
74 | extern int ext_host_var; |
75 | |
76 | // external device-side variables -> extern references to their shadows. |
77 | // LNX-DAG: @ext_device_var = external global i32 |
78 | // WIN-DAG: @"?ext_device_var@@3HA" = external dso_local global i32 |
79 | extern __device__ int ext_device_var; |
80 | // LNX-DAG: @ext_device_var = external global i32 |
81 | // WIN-DAG: @"?ext_constant_var@@3HA" = external dso_local global i32 |
82 | extern __constant__ int ext_constant_var; |
83 | |
84 | // external device-side variables with definitions should generate |
85 | // definitions for the shadows. |
86 | // LNX-DAG: @ext_device_var_def = internal global i32 undef, |
87 | // WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef |
88 | extern __device__ int ext_device_var_def; |
89 | __device__ int ext_device_var_def = 1; |
90 | // LNX-DAG: @ext_device_var_def = internal global i32 undef, |
91 | // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef |
92 | __constant__ int ext_constant_var_def = 2; |
93 | |
94 | |
95 | void use_pointers() { |
96 | int *p; |
97 | p = &device_var; |
98 | p = &constant_var; |
99 | p = &shared_var; |
100 | p = &host_var; |
101 | p = &ext_device_var; |
102 | p = &ext_constant_var; |
103 | p = &ext_host_var; |
104 | } |
105 | |
106 | // Make sure that all parts of GPU code init/cleanup are there: |
107 | // * constant unnamed string with the device-side kernel name to be passed to |
108 | // __hipRegisterFunction/__cudaRegisterFunction. |
109 | // ALL: @0 = private unnamed_addr constant [18 x i8] c"_Z10kernelfunciii\00" |
110 | // * constant unnamed string with the device-side kernel name to be passed to |
111 | // __hipRegisterVar/__cudaRegisterVar. |
112 | // ALL: @1 = private unnamed_addr constant [11 x i8] c"device_var\00" |
113 | // ALL: @2 = private unnamed_addr constant [13 x i8] c"constant_var\00" |
114 | // ALL: @3 = private unnamed_addr constant [19 x i8] c"ext_device_var_def\00" |
115 | // ALL: @4 = private unnamed_addr constant [21 x i8] c"ext_constant_var_def\00" |
116 | // * constant unnamed string with GPU binary |
117 | // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", |
118 | // HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", |
119 | // HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" |
120 | // CUDANORDC-SAME: section ".nv_fatbin", align 8 |
121 | // CUDARDC-SAME: section "__nv_relfatbin", align 8 |
122 | // * constant struct that wraps GPU binary |
123 | // ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant |
124 | // LNX-SAME: { i32, i32, i8*, i8* } |
125 | // CUDA-SAME: { i32 1180844977, i32 1, |
126 | // HIP-SAME: { i32 1212764230, i32 1, |
127 | // CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0), |
128 | // HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0), |
129 | // HIPNEF-SAME: i8* @[[FATBIN]], |
130 | // LNX-SAME: i8* null } |
131 | // CUDA-SAME: section ".nvFatBinSegment" |
132 | // HIP-SAME: section ".hipFatBinSegment" |
133 | // * variable to save GPU binary handle after initialization |
134 | // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null |
135 | // HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null |
136 | // * constant unnamed string with NVModuleID |
137 | // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant |
138 | // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 |
139 | // * Make sure our constructor was added to global ctor list. |
140 | // LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor |
141 | // * Alias to global symbol containing the NVModuleID. |
142 | // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* } |
143 | // RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper |
144 | |
145 | // Test that we build the correct number of calls to cudaSetupArgument followed |
146 | // by a call to cudaLaunch. |
147 | |
148 | // LNX: define{{.*}}kernelfunc |
149 | |
150 | // New launch sequence stores arguments into local buffer and passes array of |
151 | // pointers to them directly to cudaLaunchKernel |
152 | // CUDA-NEW: alloca |
153 | // CUDA-NEW: store |
154 | // CUDA-NEW: store |
155 | // CUDA-NEW: store |
156 | // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration |
157 | // CUDA-NEW: call{{.*}}cudaLaunchKernel |
158 | |
159 | // Legacy style launch sequence sets up arguments by passing them to |
160 | // [cuda|hip]SetupArgument. |
161 | // CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument |
162 | // CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument |
163 | // CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument |
164 | // CUDA-OLD: call{{.*}}[[PREFIX]]Launch |
165 | |
166 | // HIP: call{{.*}}[[PREFIX]]SetupArgument |
167 | // HIP: call{{.*}}[[PREFIX]]SetupArgument |
168 | // HIP: call{{.*}}[[PREFIX]]SetupArgument |
169 | // HIP: call{{.*}}[[PREFIX]]Launch |
170 | __global__ void kernelfunc(int i, int j, int k) {} |
171 | |
172 | // Test that we've built correct kernel launch sequence. |
173 | // LNX: define{{.*}}hostfunc |
174 | // CUDA-OLD: call{{.*}}[[PREFIX]]ConfigureCall |
175 | // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration |
176 | // HIP: call{{.*}}[[PREFIX]]ConfigureCall |
177 | // LNX: call{{.*}}kernelfunc |
178 | void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } |
179 | #endif |
180 | |
181 | // Test that we've built a function to register kernels and global vars. |
182 | // ALL: define internal void @__[[PREFIX]]_register_globals |
183 | // ALL: call{{.*}}[[PREFIX]]RegisterFunction(i8** %0, {{.*}}kernelfunc{{[^,]*}}, {{[^@]*}}@0 |
184 | // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}device_var{{[^,]*}}, {{[^@]*}}@1, {{.*}}i32 0, i32 4, i32 0, i32 0 |
185 | // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, i32 4, i32 1, i32 0 |
186 | // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, i32 4, i32 0, i32 0 |
187 | // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, i32 4, i32 1, i32 0 |
188 | // ALL: ret void |
189 | |
190 | // Test that we've built a constructor. |
191 | // LNX: define internal void @__[[PREFIX]]_module_ctor |
192 | |
193 | // In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper) |
194 | // HIP only register fat binary once. |
195 | // HIP: load i8**, i8*** @__hip_gpubin_handle |
196 | // HIP-NEXT: icmp eq i8** {{.*}}, null |
197 | // HIP-NEXT: br i1 {{.*}}, label %if, label %exit |
198 | // HIP: if: |
199 | // NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper |
200 | // .. stores return value in __[[PREFIX]]_gpubin_handle |
201 | // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle |
202 | // .. and then calls __[[PREFIX]]_register_globals |
203 | // HIP-NEXT: br label %exit |
204 | // HIP: exit: |
205 | // HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle |
206 | // NORDC-NEXT: call void @__[[PREFIX]]_register_globals |
207 | // * In separate mode we also register a destructor. |
208 | // NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) |
209 | |
210 | // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID% |
211 | // RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]]( |
212 | // RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper |
213 | // RDC-SAME: [[MODULE_ID_GLOBAL]] |
214 | |
215 | // Test that we've created destructor. |
216 | // NORDC: define internal void @__[[PREFIX]]_module_dtor |
217 | // NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle |
218 | // CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary |
219 | // HIP-NEXT: icmp ne i8** {{.*}}, null |
220 | // HIP-NEXT: br i1 {{.*}}, label %if, label %exit |
221 | // HIP: if: |
222 | // HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary |
223 | // HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle |
224 | // HIP-NEXT: br label %exit |
225 | // HIP: exit: |
226 | |
227 | // There should be no __[[PREFIX]]_register_globals if we have no |
228 | // device-side globals, but we still need to register GPU binary. |
229 | // Skip GPU binary string first. |
230 | // CUDANOGLOBALS: @{{.*}} = private constant{{.*}} |
231 | // HIPNOGLOBALS: @{{.*}} = internal constant{{.*}} |
232 | // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals |
233 | // NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor |
234 | // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper |
235 | // NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals |
236 | // NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor |
237 | // NOGLOBALS: call void @__[[PREFIX]]UnregisterFatBinary |
238 | |
239 | // There should be no constructors/destructors if we have no GPU binary. |
240 | // NOGPUBIN-NOT: define internal void @__[[PREFIX]]_register_globals |
241 | // NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_ctor |
242 | // NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_dtor |
243 | |