1 | // Test target codegen - host bc file has to be created first. |
2 | // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc |
3 | // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 |
4 | // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc |
5 | // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 |
6 | // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 |
7 | // expected-no-diagnostics |
8 | #ifndef HEADER |
9 | #define HEADER |
10 | |
11 | // CHECK-DAG: [[TEAM1_REDUCE_TY:%.+]] = type { [{{1024|2048}} x double] } |
12 | // CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] } |
13 | // CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] } |
14 | // CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] } |
15 | // CHECK-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } |
16 | |
17 | // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null |
18 | // CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 |
19 | // CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 |
20 | // CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} |
21 | // CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 |
22 | |
23 | // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. |
24 | // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] |
25 | |
26 | // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. |
27 | // CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 1 |
28 | // CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 1 |
29 | // CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 |
30 | |
31 | // CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer |
32 | |
33 | template<typename tx> |
34 | tx ftemplate(int n) { |
35 | int a; |
36 | short b; |
37 | tx c; |
38 | float d; |
39 | double e; |
40 | |
41 | #pragma omp target |
42 | #pragma omp teams reduction(+: e) |
43 | { |
44 | e += 5; |
45 | } |
46 | |
47 | #pragma omp target |
48 | #pragma omp teams reduction(^: c) reduction(*: d) |
49 | { |
50 | c ^= 2; |
51 | d *= 33; |
52 | } |
53 | |
54 | #pragma omp target |
55 | #pragma omp teams reduction(|: a) reduction(max: b) |
56 | #pragma omp parallel reduction(|: a) reduction(max: b) |
57 | { |
58 | a |= 1; |
59 | b = 99 > b ? 99 : b; |
60 | } |
61 | |
62 | return a+b+c+d+e; |
63 | } |
64 | |
65 | int bar(int n){ |
66 | int a = 0; |
67 | |
68 | a += ftemplate<char>(n); |
69 | |
70 | return a; |
71 | } |
72 | |
73 | // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l41}}_worker() |
74 | |
75 | // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l41]]( |
76 | // |
77 | // CHECK: {{call|invoke}} void [[T1]]_worker() |
78 | // |
79 | // CHECK: call void @__kmpc_kernel_init( |
80 | // |
81 | // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align |
82 | // CHECK: [[EV:%.+]] = load double, double* [[E]], align |
83 | // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 |
84 | // CHECK: store double [[ADD]], double* [[E]], align |
85 | // CHECK: [[GEP1:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
86 | // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8* |
87 | // CHECK: store i8* [[BC]], i8** [[GEP1]], |
88 | // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* |
89 | // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) |
90 | // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 |
91 | // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] |
92 | // |
93 | // CHECK: [[IFLABEL]] |
94 | // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align |
95 | // CHECK: [[EV:%.+]] = load double, double* [[E]], align |
96 | // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] |
97 | // CHECK: store double [[ADD]], double* [[E_IN]], align |
98 | // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) |
99 | // CHECK: br label %[[EXIT]] |
100 | // |
101 | // CHECK: [[EXIT]] |
102 | // CHECK: call void @__kmpc_kernel_deinit( |
103 | |
104 | // |
105 | // Reduction function |
106 | // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) |
107 | // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
108 | // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], |
109 | // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* |
110 | // |
111 | // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
112 | // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], |
113 | // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* |
114 | // |
115 | // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], |
116 | // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], |
117 | // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] |
118 | // CHECK: store double [[RES]], double* [[VAR_LHS]], |
119 | // CHECK: ret void |
120 | |
121 | // |
122 | // Shuffle and reduce function |
123 | // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) |
124 | // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [1 x i8*], align |
125 | // CHECK: [[REMOTE_ELT:%.+]] = alloca double |
126 | // |
127 | // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align |
128 | // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align |
129 | // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align |
130 | // |
131 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
132 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
133 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
134 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* |
135 | // |
136 | // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64* |
137 | // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64* |
138 | // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align |
139 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
140 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
141 | // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) |
142 | // |
143 | // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align |
144 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* |
145 | // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align |
146 | // |
147 | // Condition to reduce |
148 | // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 |
149 | // |
150 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
151 | // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] |
152 | // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] |
153 | // |
154 | // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 |
155 | // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 |
156 | // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 |
157 | // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] |
158 | // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 |
159 | // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] |
160 | // |
161 | // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] |
162 | // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] |
163 | // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] |
164 | // |
165 | // CHECK: [[DO_REDUCE]] |
166 | // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* |
167 | // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [1 x i8*]* [[REMOTE_RED_LIST]] to i8* |
168 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) |
169 | // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] |
170 | // |
171 | // CHECK: [[REDUCE_ELSE]] |
172 | // CHECK: br label {{%?}}[[REDUCE_CONT]] |
173 | // |
174 | // CHECK: [[REDUCE_CONT]] |
175 | // Now check if we should just copy over the remote reduction list |
176 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
177 | // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] |
178 | // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] |
179 | // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
180 | // |
181 | // CHECK: [[DO_COPY]] |
182 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
183 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
184 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
185 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
186 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* |
187 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* |
188 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align |
189 | // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align |
190 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
191 | // |
192 | // CHECK: [[COPY_ELSE]] |
193 | // CHECK: br label {{%?}}[[COPY_CONT]] |
194 | // |
195 | // CHECK: [[COPY_CONT]] |
196 | // CHECK: void |
197 | |
198 | // |
199 | // Inter warp copy function |
200 | // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) |
201 | // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 |
202 | // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 |
203 | // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [1 x i8*]* |
204 | // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], |
205 | // CHECK: br label |
206 | // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], |
207 | // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 |
208 | // CHECK: br i1 [[DONE_COPY]], label |
209 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
210 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
211 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
212 | // |
213 | // [[DO_COPY]] |
214 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
215 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
216 | // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
217 | // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] |
218 | // |
219 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
220 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], |
221 | // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], |
222 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
223 | // |
224 | // CHECK: [[COPY_ELSE]] |
225 | // CHECK: br label {{%?}}[[COPY_CONT]] |
226 | // |
227 | // Barrier after copy to shared memory storage medium. |
228 | // CHECK: [[COPY_CONT]] |
229 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
230 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
231 | // |
232 | // Read into warp 0. |
233 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
234 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
235 | // |
236 | // CHECK: [[DO_READ]] |
237 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
238 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
239 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
240 | // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
241 | // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] |
242 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], |
243 | // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], |
244 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
245 | // |
246 | // CHECK: [[READ_ELSE]] |
247 | // CHECK: br label {{%?}}[[READ_CONT]] |
248 | // |
249 | // CHECK: [[READ_CONT]] |
250 | // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 |
251 | // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], |
252 | // CHECK: br label |
253 | // CHECK: ret |
254 | |
255 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) |
256 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
257 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
258 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
259 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
260 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
261 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
262 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
263 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* |
264 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
265 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* |
266 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
267 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
268 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
269 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* |
270 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
271 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
272 | // CHECK: [[LOC_RED1:%.+]] = load double, double* [[RL_RED1]], |
273 | // CHECK: store double [[LOC_RED1]], double* [[GLOBAL_RED1_IDX_PTR]], |
274 | // CHECK: ret void |
275 | |
276 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) |
277 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
278 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
279 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
280 | // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], |
281 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
282 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
283 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
284 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
285 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* |
286 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
287 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
288 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
289 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
290 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* |
291 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
292 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* |
293 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
294 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) |
295 | // CHECK: ret void |
296 | |
297 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) |
298 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
299 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
300 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
301 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
302 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
303 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
304 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
305 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* |
306 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
307 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* |
308 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
309 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
310 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
311 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* |
312 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
313 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
314 | // CHECK: [[GLOBAL_RED1:%.+]] = load double, double* [[GLOBAL_RED1_IDX_PTR]], |
315 | // CHECK: store double [[GLOBAL_RED1]], double* [[RL_RED1]], |
316 | // CHECK: ret void |
317 | |
318 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) |
319 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
320 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
321 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
322 | // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], |
323 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
324 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
325 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
326 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
327 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* |
328 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
329 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
330 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
331 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
332 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* |
333 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
334 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* |
335 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
336 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) |
337 | // CHECK: ret void |
338 | |
339 | // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}_worker() |
340 | |
341 | // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l47]]( |
342 | // |
343 | // CHECK: {{call|invoke}} void [[T2]]_worker() |
344 | |
345 | // |
346 | // CHECK: call void @__kmpc_kernel_init( |
347 | // |
348 | // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align |
349 | // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align |
350 | // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 |
351 | // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 |
352 | // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 |
353 | // CHECK: store i8 [[TRUNC]], i8* [[C]], align |
354 | // CHECK: [[DV:%.+]] = load float, float* [[D]], align |
355 | // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} |
356 | // CHECK: store float [[MUL]], float* [[D]], align |
357 | // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
358 | // CHECK: store i8* [[C]], i8** [[GEP1]], |
359 | // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
360 | // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8* |
361 | // CHECK: store i8* [[BC]], i8** [[GEP2]], |
362 | // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* |
363 | // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) |
364 | // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 |
365 | // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] |
366 | // |
367 | // CHECK: [[IFLABEL]] |
368 | // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align |
369 | // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 |
370 | // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align |
371 | // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 |
372 | // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] |
373 | // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 |
374 | // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align |
375 | // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align |
376 | // CHECK: [[DV:%.+]] = load float, float* [[D]], align |
377 | // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] |
378 | // CHECK: store float [[MUL]], float* [[D_IN]], align |
379 | // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) |
380 | // CHECK: br label %[[EXIT]] |
381 | // |
382 | // CHECK: [[EXIT]] |
383 | // CHECK: call void @__kmpc_kernel_deinit( |
384 | |
385 | // |
386 | // Reduction function |
387 | // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) |
388 | // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
389 | // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], |
390 | // |
391 | // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
392 | // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], |
393 | // |
394 | // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 |
395 | // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], |
396 | // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* |
397 | // |
398 | // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 |
399 | // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], |
400 | // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* |
401 | // |
402 | // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], |
403 | // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 |
404 | // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], |
405 | // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 |
406 | // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] |
407 | // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 |
408 | // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], |
409 | // |
410 | // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], |
411 | // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], |
412 | // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] |
413 | // CHECK: store float [[RES]], float* [[VAR2_LHS]], |
414 | // CHECK: ret void |
415 | |
416 | // |
417 | // Shuffle and reduce function |
418 | // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) |
419 | // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align |
420 | // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 |
421 | // CHECK: [[REMOTE_ELT2:%.+]] = alloca float |
422 | // |
423 | // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align |
424 | // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align |
425 | // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align |
426 | // |
427 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
428 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
429 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
430 | // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align |
431 | // |
432 | // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 |
433 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
434 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
435 | // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) |
436 | // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 |
437 | // |
438 | // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align |
439 | // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align |
440 | // |
441 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
442 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
443 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
444 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* |
445 | // |
446 | // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32* |
447 | // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32* |
448 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align |
449 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
450 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
451 | // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) |
452 | // |
453 | // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align |
454 | // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* |
455 | // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align |
456 | // |
457 | // Condition to reduce |
458 | // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 |
459 | // |
460 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
461 | // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] |
462 | // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] |
463 | // |
464 | // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 |
465 | // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 |
466 | // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 |
467 | // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] |
468 | // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 |
469 | // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] |
470 | // |
471 | // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] |
472 | // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] |
473 | // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] |
474 | // |
475 | // CHECK: [[DO_REDUCE]] |
476 | // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* |
477 | // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8* |
478 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) |
479 | // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] |
480 | // |
481 | // CHECK: [[REDUCE_ELSE]] |
482 | // CHECK: br label {{%?}}[[REDUCE_CONT]] |
483 | // |
484 | // CHECK: [[REDUCE_CONT]] |
485 | // Now check if we should just copy over the remote reduction list |
486 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
487 | // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] |
488 | // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] |
489 | // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
490 | // |
491 | // CHECK: [[DO_COPY]] |
492 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
493 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
494 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
495 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
496 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align |
497 | // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align |
498 | // |
499 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
500 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
501 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
502 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
503 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* |
504 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* |
505 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align |
506 | // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align |
507 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
508 | // |
509 | // CHECK: [[COPY_ELSE]] |
510 | // CHECK: br label {{%?}}[[COPY_CONT]] |
511 | // |
512 | // CHECK: [[COPY_CONT]] |
513 | // CHECK: void |
514 | |
515 | // |
516 | // Inter warp copy function |
517 | // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) |
518 | // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 |
519 | // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 |
520 | // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [2 x i8*]* |
521 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
522 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
523 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
524 | // |
525 | // [[DO_COPY]] |
526 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
527 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
528 | // |
529 | // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
530 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* |
531 | // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align |
532 | // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
533 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
534 | // |
535 | // CHECK: [[COPY_ELSE]] |
536 | // CHECK: br label {{%?}}[[COPY_CONT]] |
537 | // |
538 | // Barrier after copy to shared memory storage medium. |
539 | // CHECK: [[COPY_CONT]] |
540 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
541 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
542 | // |
543 | // Read into warp 0. |
544 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
545 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
546 | // |
547 | // CHECK: [[DO_READ]] |
548 | // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
549 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* |
550 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
551 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
552 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
553 | // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align |
554 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
555 | // |
556 | // CHECK: [[READ_ELSE]] |
557 | // CHECK: br label {{%?}}[[READ_CONT]] |
558 | // |
559 | // CHECK: [[READ_CONT]] |
560 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
561 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
562 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
563 | // |
564 | // [[DO_COPY]] |
565 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
566 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
567 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
568 | // |
569 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
570 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align |
571 | // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
572 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
573 | // |
574 | // CHECK: [[COPY_ELSE]] |
575 | // CHECK: br label {{%?}}[[COPY_CONT]] |
576 | // |
577 | // Barrier after copy to shared memory storage medium. |
578 | // CHECK: [[COPY_CONT]] |
579 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
580 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
581 | // |
582 | // Read into warp 0. |
583 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
584 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
585 | // |
586 | // CHECK: [[DO_READ]] |
587 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
588 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 |
589 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
590 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
591 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
592 | // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align |
593 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
594 | // |
595 | // CHECK: [[READ_ELSE]] |
596 | // CHECK: br label {{%?}}[[READ_CONT]] |
597 | // |
598 | // CHECK: [[READ_CONT]] |
599 | // CHECK: ret |
600 | |
601 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) |
602 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
603 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
604 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
605 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
606 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
607 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
608 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
609 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* |
610 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
611 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* |
612 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
613 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
614 | // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
615 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
616 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
617 | // CHECK: [[LOC_RED1:%.+]] = load i8, i8* [[RL_RED1]], |
618 | // CHECK: store i8 [[LOC_RED1]], i8* [[GLOBAL_RED1_IDX_PTR]], |
619 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
620 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
621 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float* |
622 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
623 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
624 | // CHECK: [[LOC_RED1:%.+]] = load float, float* [[RL_RED1]], |
625 | // CHECK: store float [[LOC_RED1]], float* [[GLOBAL_RED1_IDX_PTR]], |
626 | // CHECK: ret void |
627 | |
628 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) |
629 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
630 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
631 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
632 | // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], |
633 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
634 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
635 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
636 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
637 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* |
638 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
639 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
640 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
641 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
642 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]] |
643 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
644 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
645 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
646 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8* |
647 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
648 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* |
649 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
650 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) |
651 | // CHECK: ret void |
652 | |
653 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) |
654 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
655 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
656 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
657 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
658 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
659 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
660 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
661 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* |
662 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
663 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* |
664 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
665 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
666 | // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
667 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
668 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
669 | // CHECK: [[GLOBAL_RED1:%.+]] = load i8, i8* [[GLOBAL_RED1_IDX_PTR]], |
670 | // CHECK: store i8 [[GLOBAL_RED1]], i8* [[RL_RED1]], |
671 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
672 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
673 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float* |
674 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
675 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
676 | // CHECK: [[GLOBAL_RED1:%.+]] = load float, float* [[GLOBAL_RED1_IDX_PTR]], |
677 | // CHECK: store float [[GLOBAL_RED1]], float* [[RL_RED1]], |
678 | // CHECK: ret void |
679 | |
680 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) |
681 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
682 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
683 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
684 | // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], |
685 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
686 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
687 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
688 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
689 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* |
690 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
691 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
692 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
693 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
694 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]] |
695 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
696 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
697 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
698 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8* |
699 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
700 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* |
701 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
702 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) |
703 | // CHECK: ret void |
704 | |
705 | // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l54}}( |
706 | // |
707 | // CHECK: call void @__kmpc_spmd_kernel_init( |
708 | // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() |
709 | // CHECK-NOT: call void @__kmpc_get_team_static_memory |
710 | // CHECK: store i32 0, |
711 | // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align |
712 | // CHECK: store i16 -32768, i16* [[B_ADDR:%.+]], align |
713 | // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]]) |
714 | // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
715 | // CHECK: [[BC:%.+]] = bitcast i32* [[A_ADDR]] to i8* |
716 | // CHECK: store i8* [[BC]], i8** [[GEP1]], |
717 | // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
718 | // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8* |
719 | // CHECK: store i8* [[BC]], i8** [[GEP2]], |
720 | // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* |
721 | // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) |
722 | // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 |
723 | // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] |
724 | // |
725 | // CHECK: [[IFLABEL]] |
726 | // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align |
727 | // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align |
728 | // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] |
729 | // CHECK: store i32 [[OR]], i32* [[A_IN]], align |
730 | // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align |
731 | // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 |
732 | // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align |
733 | // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 |
734 | // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] |
735 | // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] |
736 | // |
737 | // CHECK: [[DO_MAX]] |
738 | // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align |
739 | // CHECK: br label {{%?}}[[MAX_CONT:.+]] |
740 | // |
741 | // CHECK: [[MAX_ELSE]] |
742 | // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align |
743 | // CHECK: br label {{%?}}[[MAX_CONT]] |
744 | // |
745 | // CHECK: [[MAX_CONT]] |
746 | // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] |
747 | // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align |
748 | // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) |
749 | // CHECK: br label %[[EXIT]] |
750 | // |
751 | // CHECK: [[EXIT]] |
752 | // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) |
753 | |
754 | // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}}) |
755 | // |
756 | // CHECK: store i32 0, i32* [[A:%.+]], align |
757 | // CHECK: store i16 -32768, i16* [[B:%.+]], align |
758 | // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align |
759 | // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 |
760 | // CHECK: store i32 [[OR]], i32* [[A]], align |
761 | // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align |
762 | // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 |
763 | // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] |
764 | // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] |
765 | // |
766 | // CHECK: [[DO_MAX]] |
767 | // CHECK: br label {{%?}}[[MAX_CONT:.+]] |
768 | // |
769 | // CHECK: [[MAX_ELSE]] |
770 | // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align |
771 | // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 |
772 | // CHECK: br label {{%?}}[[MAX_CONT]] |
773 | // |
774 | // CHECK: [[MAX_CONT]] |
775 | // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] |
776 | // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 |
777 | // CHECK: store i16 [[TRUNC]], i16* [[B]], align |
778 | // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{.+}} 0, i[[SZ:.+]] 0 |
779 | // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* |
780 | // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align |
781 | // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 |
782 | // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* |
783 | // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align |
784 | // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* |
785 | // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* [[LOC]], i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]]) |
786 | // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 |
787 | // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] |
788 | // |
789 | // CHECK: [[IFLABEL]] |
790 | // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align |
791 | // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align |
792 | // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] |
793 | // CHECK: store i32 [[OR]], i32* [[A_IN]], align |
794 | // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align |
795 | // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 |
796 | // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align |
797 | // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 |
798 | // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] |
799 | // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] |
800 | // |
801 | // CHECK: [[DO_MAX]] |
802 | // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align |
803 | // CHECK: br label {{%?}}[[MAX_CONT:.+]] |
804 | // |
805 | // CHECK: [[MAX_ELSE]] |
806 | // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align |
807 | // CHECK: br label {{%?}}[[MAX_CONT]] |
808 | // |
809 | // CHECK: [[MAX_CONT]] |
810 | // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] |
811 | // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align |
812 | // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( |
813 | // CHECK: br label %[[EXIT]] |
814 | // |
815 | // CHECK: [[EXIT]] |
816 | // CHECK: ret void |
817 | |
818 | // |
819 | // Reduction function |
820 | // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*) |
821 | // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
822 | // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], |
823 | // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* |
824 | // |
825 | // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
826 | // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], |
827 | // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* |
828 | // |
829 | // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 |
830 | // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], |
831 | // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* |
832 | // |
833 | // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 |
834 | // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], |
835 | // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* |
836 | // |
837 | // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], |
838 | // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], |
839 | // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] |
840 | // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], |
841 | // |
842 | // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], |
843 | // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 |
844 | // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], |
845 | // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 |
846 | // |
847 | // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] |
848 | // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] |
849 | // |
850 | // CHECK: [[DO_MAX]] |
851 | // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align |
852 | // CHECK: br label {{%?}}[[MAX_CONT:.+]] |
853 | // |
854 | // CHECK: [[MAX_ELSE]] |
855 | // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align |
856 | // CHECK: br label {{%?}}[[MAX_CONT]] |
857 | // |
858 | // CHECK: [[MAX_CONT]] |
859 | // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] |
860 | // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], |
861 | // CHECK: ret void |
862 | // |
863 | // Shuffle and reduce function |
864 | // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) |
865 | // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align |
866 | // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 |
867 | // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 |
868 | // |
869 | // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align |
870 | // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align |
871 | // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align |
872 | // |
873 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
874 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
875 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
876 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
877 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align |
878 | // |
879 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
880 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
881 | // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) |
882 | // |
883 | // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align |
884 | // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* |
885 | // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align |
886 | // |
887 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
888 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
889 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
890 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
891 | // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align |
892 | // |
893 | // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 |
894 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
895 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
896 | // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) |
897 | // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 |
898 | // |
899 | // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align |
900 | // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* |
901 | // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align |
902 | // |
903 | // Condition to reduce |
904 | // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 |
905 | // |
906 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
907 | // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] |
908 | // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] |
909 | // |
910 | // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 |
911 | // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 |
912 | // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 |
913 | // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] |
914 | // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 |
915 | // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] |
916 | // |
917 | // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] |
918 | // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] |
919 | // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] |
920 | // |
921 | // CHECK: [[DO_REDUCE]] |
922 | // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* |
923 | // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* |
924 | // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) |
925 | // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] |
926 | // |
927 | // CHECK: [[REDUCE_ELSE]] |
928 | // CHECK: br label {{%?}}[[REDUCE_CONT]] |
929 | // |
930 | // CHECK: [[REDUCE_CONT]] |
931 | // Now check if we should just copy over the remote reduction list |
932 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
933 | // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] |
934 | // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] |
935 | // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
936 | // |
937 | // CHECK: [[DO_COPY]] |
938 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 |
939 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
940 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 |
941 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
942 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* |
943 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
944 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align |
945 | // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align |
946 | // |
947 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
948 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
949 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
950 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
951 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* |
952 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
953 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align |
954 | // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align |
955 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
956 | // |
957 | // CHECK: [[COPY_ELSE]] |
958 | // CHECK: br label {{%?}}[[COPY_CONT]] |
959 | // |
960 | // CHECK: [[COPY_CONT]] |
961 | // CHECK: void |
962 | |
963 | // |
964 | // Inter warp copy function |
965 | // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32) |
966 | // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 |
967 | // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 |
968 | // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* |
969 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
970 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
971 | // |
972 | // [[DO_COPY]] |
973 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 |
974 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
975 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
976 | // |
977 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
978 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align |
979 | // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
980 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
981 | // |
982 | // CHECK: [[COPY_ELSE]] |
983 | // CHECK: br label {{%?}}[[COPY_CONT]] |
984 | // |
985 | // Barrier after copy to shared memory storage medium. |
986 | // CHECK: [[COPY_CONT]] |
987 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
988 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
989 | // |
990 | // Read into warp 0. |
991 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
992 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
993 | // |
994 | // CHECK: [[DO_READ]] |
995 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
996 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
997 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
998 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
999 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1000 | // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align |
1001 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
1002 | // |
1003 | // CHECK: [[READ_ELSE]] |
1004 | // CHECK: br label {{%?}}[[READ_CONT]] |
1005 | // |
1006 | // CHECK: [[READ_CONT]] |
1007 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1008 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
1009 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
1010 | // |
1011 | // [[DO_COPY]] |
1012 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
1013 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1014 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1015 | // |
1016 | // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
1017 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* |
1018 | // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align |
1019 | // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1020 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
1021 | // |
1022 | // CHECK: [[COPY_ELSE]] |
1023 | // CHECK: br label {{%?}}[[COPY_CONT]] |
1024 | // |
1025 | // Barrier after copy to shared memory storage medium. |
1026 | // CHECK: [[COPY_CONT]] |
1027 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1028 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
1029 | // |
1030 | // Read into warp 0. |
1031 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
1032 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
1033 | // |
1034 | // CHECK: [[DO_READ]] |
1035 | // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
1036 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* |
1037 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 |
1038 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1039 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1040 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1041 | // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align |
1042 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
1043 | // |
1044 | // CHECK: [[READ_ELSE]] |
1045 | // CHECK: br label {{%?}}[[READ_CONT]] |
1046 | // |
1047 | // CHECK: [[READ_CONT]] |
1048 | // CHECK: ret |
1049 | |
1050 | // |
1051 | // Reduction function |
1052 | // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) |
1053 | // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
1054 | // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], |
1055 | // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* |
1056 | // |
1057 | // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
1058 | // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], |
1059 | // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* |
1060 | // |
1061 | // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 |
1062 | // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], |
1063 | // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* |
1064 | // |
1065 | // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 |
1066 | // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], |
1067 | // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* |
1068 | // |
1069 | // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], |
1070 | // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], |
1071 | // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] |
1072 | // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], |
1073 | // |
1074 | // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], |
1075 | // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 |
1076 | // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], |
1077 | // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 |
1078 | // |
1079 | // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] |
1080 | // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] |
1081 | // |
1082 | // CHECK: [[DO_MAX]] |
1083 | // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align |
1084 | // CHECK: br label {{%?}}[[MAX_CONT:.+]] |
1085 | // |
1086 | // CHECK: [[MAX_ELSE]] |
1087 | // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align |
1088 | // CHECK: br label {{%?}}[[MAX_CONT]] |
1089 | // |
1090 | // CHECK: [[MAX_CONT]] |
1091 | // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] |
1092 | // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], |
1093 | // CHECK: ret void |
1094 | |
1095 | // |
1096 | // Shuffle and reduce function |
1097 | // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) |
1098 | // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align |
1099 | // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 |
1100 | // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 |
1101 | // |
1102 | // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align |
1103 | // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align |
1104 | // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align |
1105 | // |
1106 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
1107 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1108 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 |
1109 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
1110 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align |
1111 | // |
1112 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
1113 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
1114 | // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) |
1115 | // |
1116 | // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align |
1117 | // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* |
1118 | // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align |
1119 | // |
1120 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
1121 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1122 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
1123 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1124 | // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align |
1125 | // |
1126 | // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 |
1127 | // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() |
1128 | // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 |
1129 | // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) |
1130 | // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 |
1131 | // |
1132 | // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align |
1133 | // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* |
1134 | // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align |
1135 | // |
1136 | // Condition to reduce |
1137 | // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 |
1138 | // |
1139 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
1140 | // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] |
1141 | // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] |
1142 | // |
1143 | // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 |
1144 | // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 |
1145 | // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 |
1146 | // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] |
1147 | // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 |
1148 | // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] |
1149 | // |
1150 | // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] |
1151 | // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] |
1152 | // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] |
1153 | // |
1154 | // CHECK: [[DO_REDUCE]] |
1155 | // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* |
1156 | // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8* |
1157 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) |
1158 | // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] |
1159 | // |
1160 | // CHECK: [[REDUCE_ELSE]] |
1161 | // CHECK: br label {{%?}}[[REDUCE_CONT]] |
1162 | // |
1163 | // CHECK: [[REDUCE_CONT]] |
1164 | // Now check if we should just copy over the remote reduction list |
1165 | // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 |
1166 | // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] |
1167 | // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] |
1168 | // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
1169 | // |
1170 | // CHECK: [[DO_COPY]] |
1171 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 |
1172 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
1173 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 |
1174 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1175 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* |
1176 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
1177 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align |
1178 | // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align |
1179 | // |
1180 | // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
1181 | // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], |
1182 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 |
1183 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1184 | // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* |
1185 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1186 | // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align |
1187 | // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align |
1188 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
1189 | // |
1190 | // CHECK: [[COPY_ELSE]] |
1191 | // CHECK: br label {{%?}}[[COPY_CONT]] |
1192 | // |
1193 | // CHECK: [[COPY_CONT]] |
1194 | // CHECK: void |
1195 | |
1196 | // |
1197 | // Inter warp copy function |
1198 | // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) |
1199 | // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 |
1200 | // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 |
1201 | // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* |
1202 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1203 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
1204 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
1205 | // |
1206 | // [[DO_COPY]] |
1207 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 |
1208 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1209 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
1210 | // |
1211 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
1212 | // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align |
1213 | // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1214 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
1215 | // |
1216 | // CHECK: [[COPY_ELSE]] |
1217 | // CHECK: br label {{%?}}[[COPY_CONT]] |
1218 | // |
1219 | // Barrier after copy to shared memory storage medium. |
1220 | // CHECK: [[COPY_CONT]] |
1221 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1222 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
1223 | // |
1224 | // Read into warp 0. |
1225 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
1226 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
1227 | // |
1228 | // CHECK: [[DO_READ]] |
1229 | // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
1230 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 |
1231 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1232 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* |
1233 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1234 | // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align |
1235 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
1236 | // |
1237 | // CHECK: [[READ_ELSE]] |
1238 | // CHECK: br label {{%?}}[[READ_CONT]] |
1239 | // |
1240 | // CHECK: [[READ_CONT]] |
1241 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1242 | // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 |
1243 | // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] |
1244 | // |
1245 | // [[DO_COPY]] |
1246 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 |
1247 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1248 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1249 | // |
1250 | // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] |
1251 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* |
1252 | // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align |
1253 | // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1254 | // CHECK: br label {{%?}}[[COPY_CONT:.+]] |
1255 | // |
1256 | // CHECK: [[COPY_ELSE]] |
1257 | // CHECK: br label {{%?}}[[COPY_CONT]] |
1258 | // |
1259 | // Barrier after copy to shared memory storage medium. |
1260 | // CHECK: [[COPY_CONT]] |
1261 | // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ |
1262 | // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* |
1263 | // |
1264 | // Read into warp 0. |
1265 | // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] |
1266 | // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] |
1267 | // |
1268 | // CHECK: [[DO_READ]] |
1269 | // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] |
1270 | // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* |
1271 | // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 |
1272 | // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], |
1273 | // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* |
1274 | // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align |
1275 | // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align |
1276 | // CHECK: br label {{%?}}[[READ_CONT:.+]] |
1277 | // |
1278 | // CHECK: [[READ_ELSE]] |
1279 | // CHECK: br label {{%?}}[[READ_CONT]] |
1280 | // |
1281 | // CHECK: [[READ_CONT]] |
1282 | // CHECK: ret |
1283 | |
1284 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) |
1285 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
1286 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
1287 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
1288 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
1289 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
1290 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
1291 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
1292 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* |
1293 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
1294 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* |
1295 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
1296 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1297 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
1298 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32* |
1299 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1300 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1301 | // CHECK: [[LOC_RED1:%.+]] = load i32, i32* [[RL_RED1]], |
1302 | // CHECK: store i32 [[LOC_RED1]], i32* [[GLOBAL_RED1_IDX_PTR]], |
1303 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1304 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
1305 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16* |
1306 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1307 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1308 | // CHECK: [[LOC_RED1:%.+]] = load i16, i16* [[RL_RED1]], |
1309 | // CHECK: store i16 [[LOC_RED1]], i16* [[GLOBAL_RED1_IDX_PTR]], |
1310 | // CHECK: ret void |
1311 | |
1312 | // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) |
1313 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
1314 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
1315 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
1316 | // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], |
1317 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
1318 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
1319 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
1320 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
1321 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* |
1322 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
1323 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1324 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1325 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1326 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8* |
1327 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
1328 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1329 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1330 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1331 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8* |
1332 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
1333 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* |
1334 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
1335 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) |
1336 | // CHECK: ret void |
1337 | |
1338 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) |
1339 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
1340 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
1341 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
1342 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
1343 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
1344 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
1345 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
1346 | // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* |
1347 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
1348 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* |
1349 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
1350 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1351 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
1352 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32* |
1353 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1354 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1355 | // CHECK: [[GLOBAL_RED1:%.+]] = load i32, i32* [[GLOBAL_RED1_IDX_PTR]], |
1356 | // CHECK: store i32 [[GLOBAL_RED1]], i32* [[RL_RED1]], |
1357 | // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1358 | // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], |
1359 | // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16* |
1360 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1361 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1362 | // CHECK: [[GLOBAL_RED1:%.+]] = load i16, i16* [[GLOBAL_RED1_IDX_PTR]], |
1363 | // CHECK: store i16 [[GLOBAL_RED1]], i16* [[RL_RED1]], |
1364 | // CHECK: ret void |
1365 | |
1366 | // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) |
1367 | // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, |
1368 | // CHECK: [[IDX_PTR:%.+]] = alloca i32, |
1369 | // CHECK: [[RL_PTR:%.+]] = alloca i8*, |
1370 | // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], |
1371 | // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], |
1372 | // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], |
1373 | // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], |
1374 | // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], |
1375 | // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* |
1376 | // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], |
1377 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1378 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 |
1379 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1380 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8* |
1381 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
1382 | // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1383 | // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 |
1384 | // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] |
1385 | // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8* |
1386 | // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] |
1387 | // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* |
1388 | // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], |
1389 | // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) |
1390 | // CHECK: ret void |
1391 | |
1392 | #endif |
1393 | |