Clang Project

clang_source_code/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, void *global_buffer, int32_t num_of_records, void*
  /// reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
  /// *buffer, int idx, void *reduce_data));
  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
  /// const void *buf, size_t size, int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t
  /// isSPMDExecutionMode, int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
  OMPRTL__kmpc_barrier,
  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL__kmpc_barrier_simple_spmd,
};
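
// Illustration (editor's addition, not part of the original file): a rough
// pseudo-C sketch of how a generic-mode kernel is expected to use the entry
// points above. The exact control flow is hypothetical and simplified; the
// real protocol is emitted by emitNonSPMDKernel/emitWorkerFunction below.
//
//   if (threadIdx.x < thread_limit) {             // worker warps
//     for (;;) {
//       void *work_fn;
//       if (!__kmpc_kernel_parallel(&work_fn, /*IsOMPRuntimeInitialized=*/1))
//         break;                                  // kernel is shutting down
//       if (work_fn)
//         ((void (*)())work_fn)();                // run outlined parallel body
//       __kmpc_kernel_end_parallel();
//     }
//   } else if (threadIdx.x == master_tid) {       // master (last warp)
//     __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//     // ...sequential target region; publishes parallel work via
//     //    __kmpc_kernel_prepare_parallel(...)...
//     __kmpc_kernel_deinit(/*IsOMPRuntimeInitialized=*/1);
//   }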

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeNVPTX::EM_Unknown;
  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
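
// Worked example (editor's addition): with WarpSize == 32, LaneIDBits ==
// log_2(32) == 5 and LaneIDMask == 31 (0b11111), so a thread id decomposes as
//   warp_id = tid >> LaneIDBits;   // e.g. 37 >> 5 == 1
//   lane_id = tid &  LaneIDMask;   // e.g. 37 & 31 == 5
// which is exactly what getNVPTXWarpID/getNVPTXLaneID emit further below.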

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(
                   static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
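
// Illustration (editor's addition, hypothetical target region): two common
// ways a local escapes its declaration context and gets globalized by the
// visitor above:
//
//   #pragma omp target
//   {
//     int a = 0;
//     #pragma omp parallel shared(a)  // captured by reference: escapes via
//     { use(a); }                     // VisitOpenMPCapturedStmt
//     consume(&a);                    // UO_AddrOf sets AllEscaped: escapes
//   }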

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block.  Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
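
// Worked example (editor's addition): the emitted expression is
//   master_tid = (NumThreads - 1) & ~(WarpSize - 1)
// i.e. (NumThreads - 1) rounded down to a multiple of WarpSize. With
// WarpSize == 32, so ~Mask clears the low five bits:
//   NumThreads == 33:     32 & ~31 ==  32
//   NumThreads == 64:     63 & ~31 ==  32
//   NumThreads == 1024: 1023 & ~31 == 992
// matching the examples in the doc comment above.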

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}

/// Checks if the expression is constant or does not have non-trivial function
/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr *E) {
  // We can skip constant expressions.
  // We can skip expressions with trivial calls or simple expressions.
  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
          !E->hasNonTrivialCall(Ctx)) &&
         !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
}

/// Checks if \p Body is a CompoundStmt and returns its child statement
/// iff there is exactly one child that is not evaluatable at compile time.
static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
    const Stmt *Child = nullptr;
    for (const Stmt *S : C->body()) {
      if (const auto *E = dyn_cast<Expr>(S)) {
        if (isTrivial(Ctx, E))
          continue;
      }
      // Some of the statements can be ignored.
      if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
          isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
        continue;
      // Analyze declarations.
      if (const auto *DS = dyn_cast<DeclStmt>(S)) {
        if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
              if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
                  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
                  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
                  isa<UsingDirectiveDecl>(D) ||
                  isa<OMPDeclareReductionDecl>(D) ||
                  isa<OMPThreadPrivateDecl>(D) || isa<OMPAllocateDecl>(D))
                return true;
              const auto *VD = dyn_cast<VarDecl>(D);
              if (!VD)
                return false;
              return VD->isConstexpr() ||
                     ((VD->getType().isTrivialType(Ctx) ||
                       VD->getType()->isReferenceType()) &&
                      (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
            }))
          continue;
      }
      // Found multiple children - cannot get the one child only.
      if (Child)
        return Body;
      Child = S;
    }
    if (Child)
      return Child;
  }
  return Body;
}
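
// Illustration (editor's addition, hypothetical body): for a captured region
// such as
//   {
//     constexpr int N = 128;     // constexpr decl: ignored
//     ;                          // NullStmt: ignored
//     #pragma omp parallel for   // the only "real" child
//     for (int i = 0; i < N; ++i) work(i);
//   }
// getSingleCompoundChild returns the 'parallel for' directive, which lets the
// SPMD checks below look through trivial wrapper statements.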

/// Check if the parallel directive has an 'if' clause with a non-constant or
/// false condition, or strictly specifies the number of threads. Such
/// directives must be run in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
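
// Illustration (editor's addition, hypothetical directives): these both
// return true and therefore inhibit SPMD mode:
//   #pragma omp parallel num_threads(4)      // thread count is fixed
//   #pragma omp parallel if(parallel: cond)  // cond is not a compile-time
//                                            // constant (or folds to false)
// whereas '#pragma omp parallel if(parallel: 1)' alone would return false.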

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
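
// Illustration (editor's addition, hypothetical code): a target region that
// qualifies for SPMD mode, because the tightly nested construct is a parallel
// directive with neither a num_threads clause nor a non-constant 'if':
//   #pragma omp target teams
//   #pragma omp parallel for
//   for (int i = 0; i < n; ++i) work(i);
// Adding num_threads(...) to the inner parallel would force non-SPMD mode.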

/// Check if the directive is loop-based and either has no schedule clause at
/// all or has static scheduling; an 'ordered' clause disqualifies it.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
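
// Illustration (editor's addition): under this predicate,
//   #pragma omp for                          // no schedule clause: qualifies
//   #pragma omp for schedule(static)         // static: qualifies
//   #pragma omp for schedule(dynamic)        // non-static: does not qualify
//   #pragma omp for ordered schedule(static) // 'ordered' disqualifies it
// Only the first two permit the lightweight-runtime checks below.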

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Ctx, Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
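
// Illustration (editor's addition, hypothetical directive): a combined
// construct that satisfies both predicates and can therefore run in SPMD mode
// with the lightweight runtime:
//   #pragma omp target teams distribute parallel for schedule(static)
//   for (int i = 0; i < n; ++i) a[i] += b[i];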

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
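
// Illustration (editor's addition): the entry header and footer above emit,
// roughly, the following kernel skeleton (pseudo-C for the generated IR):
//
//   if (tid < thread_limit) {          // .worker
//     worker();                        // spin in the worker loop
//     goto exit;
//   }
//   if (tid != master_tid) goto exit;  // .mastercheck
//   // .master
//   __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//   __kmpc_data_sharing_init_stack();
//   // ...target region body...
//   __kmpc_kernel_deinit(/*IsOMPRuntimeInitialized=*/1);
//   syncCTAThreads();                  // release workers to terminate
// exit:                                // .exit
//   return;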
1308
1309void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1310                                          StringRef ParentName,
1311                                          llvm::Function *&OutlinedFn,
1312                                          llvm::Constant *&OutlinedFnID,
1313                                          bool IsOffloadEntry,
1314                                          const RegionCodeGenTy &CodeGen) {
1315  ExecutionRuntimeModesRAII ModeRAII(
1316      CurrentExecutionModeRequiresFullRuntime,
1317      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1318          !supportsLightweightRuntime(CGM.getContext()D));
1319  EntryFunctionState EST;
1320
1321  // Emit target region as a standalone region.
1322  class NVPTXPrePostActionTy : public PrePostActionTy {
1323    CGOpenMPRuntimeNVPTX &RT;
1324    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1325    const OMPExecutableDirective &D;
1326
1327  public:
1328    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1329                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1330                         const OMPExecutableDirective &D)
1331        : RT(RT), EST(EST), D(D) {}
1332    void Enter(CodeGenFunction &CGF) override {
1333      RT.emitSPMDEntryHeader(CGF, EST, D);
1334      // Skip target region initialization.
1335      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1336    }
1337    void Exit(CodeGenFunction &CGF) override {
1338      RT.clearLocThreadIdInsertPt(CGF);
1339      RT.emitSPMDEntryFooter(CGF, EST);
1340    }
1341  } Action(*this, EST, D);
1342  CodeGen.setAction(Action);
1343  IsInTTDRegion = true;
1344  // Reserve place for the globalized memory.
1345  GlobalizedRecords.emplace_back();
1346  if (!KernelStaticGlobalized) {
1347    KernelStaticGlobalized = new llvm::GlobalVariable(
1348        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1349        llvm::GlobalValue::InternalLinkage,
1350        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1351        "_openmp_kernel_static_glob_rd$ptr"/*InsertBefore=*/nullptr,
1352        llvm::GlobalValue::NotThreadLocal,
1353        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1354  }
1355  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1356                                   IsOffloadEntry, CodeGen);
1357  IsInTTDRegion = false;
1358}
1359
1360void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1361    CodeGenFunction &CGF, EntryFunctionState &EST,
1362    const OMPExecutableDirective &D) {
1363  CGBuilderTy &Bld = CGF.Builder;
1364
1365  // Setup BBs in entry function.
1366  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1367  EST.ExitBB = CGF.createBasicBlock(".exit");
1368
1369  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1370                         /*RequiresOMPRuntime=*/
1371                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1372                         /*RequiresDataSharing=*/Bld.getInt16(0)};
1373  CGF.EmitRuntimeCall(
1374      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1375
1376  if (RequiresFullRuntime) {
1377    // For data sharing, we need to initialize the stack.
1378    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1379        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1380  }
1381
1382  CGF.EmitBranch(ExecuteBB);
1383
1384  CGF.EmitBlock(ExecuteBB);
1385
1386  IsInTargetMasterThreadRegion = true;
1387}
1388
1389void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1390                                               EntryFunctionState &EST) {
1391  IsInTargetMasterThreadRegion = false;
1392  if (!CGF.HaveInsertPoint())
1393    return;
1394
1395  if (!EST.ExitBB)
1396    EST.ExitBB = CGF.createBasicBlock(".exit");
1397
1398  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1399  CGF.EmitBranch(OMPDeInitBB);
1400
1401  CGF.EmitBlock(OMPDeInitBB);
1402  // DeInitialize the OMP state in the runtime; called by all active threads.
1403  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1404                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1405  CGF.EmitRuntimeCall(
1406      createNVPTXRuntimeFunction(
1407          OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1408  CGF.EmitBranch(EST.ExitBB);
1409
1410  CGF.EmitBlock(EST.ExitBB);
1411  EST.ExitBB = nullptr;
1412}
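// Editor's note: taken together, the SPMD header and footer above bracket the
// target region body in roughly this skeleton (illustrative, not verbatim IR):
//
//   __kmpc_spmd_kernel_init(thread_limit, RequiresOMPRuntime, /*RequiresDataSharing=*/0);
//   if (RequiresFullRuntime) __kmpc_data_sharing_init_stack_spmd();
//   .execute:    ... target region body, executed by all threads ...
//   .omp.deinit: __kmpc_spmd_kernel_deinit_v2(RequiresOMPRuntime);
//   .exit: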
1413
1414// Create a unique global variable to indicate the execution mode of this target
1415// region. The execution mode is either 'generic' or 'spmd', depending on the
1416// target directive. This variable is picked up by the offload library to set
1417// up the device appropriately before kernel launch. If the execution mode is
1418// 'generic', the runtime reserves one warp for the master, otherwise, all
1419// warps participate in parallel work.
1420static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1421                                     bool Mode) {
1422  auto *GVMode =
1423      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1424                               llvm::GlobalValue::WeakAnyLinkage,
1425                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1426                               Twine(Name, "_exec_mode"));
1427  CGM.addCompilerUsedGlobal(GVMode);
1428}
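// Editor's note: for a kernel named, e.g., "__omp_offloading_42_foo_l10" (name
// hypothetical), setPropertyExecutionMode emits roughly the following IR: an i8
// global that is 0 for SPMD mode and 1 for generic mode:
//
//   @__omp_offloading_42_foo_l10_exec_mode = weak constant i8 0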
1429
1430void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1431  ASTContext &Ctx = CGM.getContext();
1432
1433  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1434  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1435                    WST.Loc, WST.Loc);
1436  emitWorkerLoop(CGF, WST);
1437  CGF.FinishFunction();
1438}
1439
1440void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1441                                          WorkerFunctionState &WST) {
1442  //
1443  // The workers enter this loop and wait for parallel work from the master.
1444  // When the master encounters a parallel region it sets up the work + variable
1445  // arguments, and wakes up the workers.  The workers first check to see if
1446  // they are required for the parallel region, i.e., within the # of requested
1447  // parallel threads.  The activated workers load the variable arguments and
1448  // execute the parallel work.
1449  //
1450
1451  CGBuilderTy &Bld = CGF.Builder;
1452
1453  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1454  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1455  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1456  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1457  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1458  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1459
1460  CGF.EmitBranch(AwaitBB);
1461
1462  // Workers wait for work from master.
1463  CGF.EmitBlock(AwaitBB);
1464  // Wait for parallel work
1465  syncCTAThreads(CGF);
1466
1467  Address WorkFn =
1468      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1469  Address ExecStatus =
1470      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1471  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1472  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1473
1474  // TODO: Optimize runtime initialization and pass in correct value.
1475  llvm::Value *Args[] = {WorkFn.getPointer(),
1476                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1477  llvm::Value *Ret = CGF.EmitRuntimeCall(
1478      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1479  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1480
1481  // On termination condition (workid == 0), exit loop.
1482  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1483  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1484  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1485
1486  // Activate requested workers.
1487  CGF.EmitBlock(SelectWorkersBB);
1488  llvm::Value *IsActive =
1489      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1490  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1491
1492  // Signal start of parallel region.
1493  CGF.EmitBlock(ExecuteBB);
1494  // Skip initialization.
1495  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1496
1497  // Process work items: outlined parallel functions.
1498  for (llvm::Function *W : Work) {
1499    // Try to match this outlined function.
1500    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1501
1502    llvm::Value *WorkFnMatch =
1503        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1504
1505    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1506    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1507    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1508
1509    // Execute this outlined function.
1510    CGF.EmitBlock(ExecuteFNBB);
1511
1512    // Insert call to work function via shared wrapper. The shared
1513    // wrapper takes two arguments:
1514    //   - the parallelism level;
1515    //   - the thread ID;
1516    emitCall(CGF, WST.Loc, W,
1517             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1518
1519    // Go to end of parallel region.
1520    CGF.EmitBranch(TerminateBB);
1521
1522    CGF.EmitBlock(CheckNextBB);
1523  }
1524  // Default case: call to outlined function through pointer if the target
1525  // region makes a declare target call that may contain an orphaned parallel
1526  // directive.
1527  auto *ParallelFnTy =
1528      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1529                              /*isVarArg=*/false);
1530  llvm::Value *WorkFnCast =
1531      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1532  // Insert call to work function via shared wrapper. The shared
1533  // wrapper takes two arguments:
1534  //   - the parallelism level;
1535  //   - the thread ID;
1536  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1537           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1538  // Go to end of parallel region.
1539  CGF.EmitBranch(TerminateBB);
1540
1541  // Signal end of parallel region.
1542  CGF.EmitBlock(TerminateBB);
1543  CGF.EmitRuntimeCall(
1544      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1545      llvm::None);
1546  CGF.EmitBranch(BarrierBB);
1547
1548  // All active and inactive workers wait at a barrier after parallel region.
1549  CGF.EmitBlock(BarrierBB);
1550  // Barrier after parallel region.
1551  syncCTAThreads(CGF);
1552  CGF.EmitBranch(AwaitBB);
1553
1554  // Exit target region.
1555  CGF.EmitBlock(ExitBB);
1556  // Skip initialization.
1557  clearLocThreadIdInsertPt(CGF);
1558}
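// Editor's note: a condensed sketch (not in the original source) of the worker
// state machine emitted above:
//
//   .await.work:       barrier; active = __kmpc_kernel_parallel(&work_fn, ...);
//                      if (work_fn == null) goto .exit;
//   .select.workers:   if (!active) goto .barrier.parallel;
//   .execute.parallel: run the matching outlined function (or an indirect call
//                      for an orphaned 'parallel'); __kmpc_kernel_end_parallel();
//   .barrier.parallel: barrier; goto .await.work;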
1559
1560/// Returns specified OpenMP runtime function for the current OpenMP
1561/// implementation.  Specialized for the NVPTX device.
1562/// \param Function OpenMP runtime function.
1563/// \return Specified function.
1564llvm::FunctionCallee
1565CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1566  llvm::FunctionCallee RTLFn = nullptr;
1567  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1568  case OMPRTL_NVPTX__kmpc_kernel_init: {
1569    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1570    // RequiresOMPRuntime);
1571    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1572    auto *FnTy =
1573        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1574    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1575    break;
1576  }
1577  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1578    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1579    llvm::Type *TypeParams[] = {CGM.Int16Ty};
1580    auto *FnTy =
1581        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1583    break;
1584  }
1585  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1586    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1587    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1588    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1589    auto *FnTy =
1590        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1591    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1592    break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1595    // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1596    llvm::Type *TypeParams[] = {CGM.Int16Ty};
1597    auto *FnTy =
1598        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1599    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1600    break;
1601  }
1602  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1603    /// Build void __kmpc_kernel_prepare_parallel(
1604    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1605    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1606    auto *FnTy =
1607        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1608    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1609    break;
1610  }
1611  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1612    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1613    /// int16_t IsOMPRuntimeInitialized);
1614    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1615    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1616    auto *FnTy =
1617        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1618    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1619    break;
1620  }
1621  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1622    /// Build void __kmpc_kernel_end_parallel();
1623    auto *FnTy =
1624        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1625    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1626    break;
1627  }
1628  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1629    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1630    // global_tid);
1631    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1632    auto *FnTy =
1633        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1634    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1635    break;
1636  }
1637  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1638    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1639    // global_tid);
1640    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1641    auto *FnTy =
1642        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1643    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1644    break;
1645  }
1646  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1647    // Build int32_t __kmpc_shuffle_int32(int32_t element,
1648    // int16_t lane_offset, int16_t warp_size);
1649    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1650    auto *FnTy =
1651        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1652    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1653    break;
1654  }
1655  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1656    // Build int64_t __kmpc_shuffle_int64(int64_t element,
1657    // int16_t lane_offset, int16_t warp_size);
1658    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1659    auto *FnTy =
1660        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1661    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1662    break;
1663  }
1664  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1665    // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1666    // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1667    // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1668    // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1669    // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1670    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1671                                             CGM.Int16Ty, CGM.Int16Ty};
1672    auto *ShuffleReduceFnTy =
1673        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1674                                /*isVarArg=*/false);
1675    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1676    auto *InterWarpCopyFnTy =
1677        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1678                                /*isVarArg=*/false);
1679    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1680                                CGM.Int32Ty,
1681                                CGM.Int32Ty,
1682                                CGM.SizeTy,
1683                                CGM.VoidPtrTy,
1684                                ShuffleReduceFnTy->getPointerTo(),
1685                                InterWarpCopyFnTy->getPointerTo()};
1686    auto *FnTy =
1687        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1688    RTLFn = CGM.CreateRuntimeFunction(
1689        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1690    break;
1691  }
1692  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1693    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
1694    llvm::Type *TypeParams[] = {CGM.Int32Ty};
1695    auto *FnTy =
1696        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1697    RTLFn = CGM.CreateRuntimeFunction(
1698        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1699    break;
1700  }
1701  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1702    // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1703    // global_tid, void *global_buffer, int32_t num_of_records, void*
1704    // reduce_data,
1705    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1706    // lane_offset, int16_t shortCircuit),
1707    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1708    // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1709    // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1710    // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1711    // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1712    // *buffer, int idx, void *reduce_data));
1713    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1714                                             CGM.Int16Ty, CGM.Int16Ty};
1715    auto *ShuffleReduceFnTy =
1716        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1717                                /*isVarArg=*/false);
1718    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1719    auto *InterWarpCopyFnTy =
1720        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1721                                /*isVarArg=*/false);
1722    llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1723                                          CGM.VoidPtrTy};
1724    auto *GlobalListFnTy =
1725        llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1726                                /*isVarArg=*/false);
1727    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1728                                CGM.Int32Ty,
1729                                CGM.VoidPtrTy,
1730                                CGM.Int32Ty,
1731                                CGM.VoidPtrTy,
1732                                ShuffleReduceFnTy->getPointerTo(),
1733                                InterWarpCopyFnTy->getPointerTo(),
1734                                GlobalListFnTy->getPointerTo(),
1735                                GlobalListFnTy->getPointerTo(),
1736                                GlobalListFnTy->getPointerTo(),
1737                                GlobalListFnTy->getPointerTo()};
1738    auto *FnTy =
1739        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1740    RTLFn = CGM.CreateRuntimeFunction(
1741        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1742    break;
1743  }
1744  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1745    /// Build void __kmpc_data_sharing_init_stack();
1746    auto *FnTy =
1747        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1748    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1749    break;
1750  }
1751  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1752    /// Build void __kmpc_data_sharing_init_stack_spmd();
1753    auto *FnTy =
1754        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1755    RTLFn =
1756        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1757    break;
1758  }
1759  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1760    // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1761    // int16_t UseSharedMemory);
1762    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1763    auto *FnTy =
1764        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1765    RTLFn = CGM.CreateRuntimeFunction(
1766        FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1767    break;
1768  }
1769  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1770    // Build void __kmpc_data_sharing_pop_stack(void *a);
1771    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1772    auto *FnTy =
1773        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1774    RTLFn = CGM.CreateRuntimeFunction(FnTy,
1775                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
1776    break;
1777  }
1778  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1779    /// Build void __kmpc_begin_sharing_variables(void ***args,
1780    /// size_t n_args);
1781    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1782    auto *FnTy =
1783        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1784    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1785    break;
1786  }
1787  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1788    /// Build void __kmpc_end_sharing_variables();
1789    auto *FnTy =
1790        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1791    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1792    break;
1793  }
1794  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1795    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1796    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1797    auto *FnTy =
1798        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1799    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1800    break;
1801  }
1802  case OMPRTL_NVPTX__kmpc_parallel_level: {
1803    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1804    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1805    auto *FnTy =
1806        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1807    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1808    break;
1809  }
1810  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1811    // Build int8_t __kmpc_is_spmd_exec_mode();
1812    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1813    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1814    break;
1815  }
1816  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1817    // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1818    // const void *buf, size_t size, int16_t is_shared, const void **res);
1819    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1820                                CGM.Int16Ty, CGM.VoidPtrPtrTy};
1821    auto *FnTy =
1822        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1823    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1824    break;
1825  }
1826  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1827    // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1828    // int16_t is_shared);
1829    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1830    auto *FnTy =
1831        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1832    RTLFn =
1833        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1834    break;
1835  }
1836  case OMPRTL__kmpc_barrier: {
1837    // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1838    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1839    auto *FnTy =
1840        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1841    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1842    cast<llvm::Function>(RTLFn.getCallee())
1843        ->addFnAttr(llvm::Attribute::Convergent);
1844    break;
1845  }
1846  case OMPRTL__kmpc_barrier_simple_spmd: {
1847    // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1848    // global_tid);
1849    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1850    auto *FnTy =
1851        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1852    RTLFn =
1853        CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1854    cast<llvm::Function>(RTLFn.getCallee())
1855        ->addFnAttr(llvm::Attribute::Convergent);
1856    break;
1857  }
1858  }
1859  return RTLFn;
1860}
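// Editor's note: callers pair the callee returned above with EmitRuntimeCall;
// a typical use, following the pattern seen throughout this file (argument
// values illustrative):
//
//   llvm::Value *Args[] = {RTLoc, ThreadID};
//   CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);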
1861
1862void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1863                                              llvm::Constant *Addr,
1864                                              uint64_t Size, int32_t,
1865                                              llvm::GlobalValue::LinkageTypes) {
1866  // TODO: Add support for global variables on the device after declare target
1867  // support.
1868  if (!isa<llvm::Function>(Addr))
1869    return;
1870  llvm::Module &M = CGM.getModule();
1871  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1872
1873  // Get "nvvm.annotations" metadata node
1874  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1875
1876  llvm::Metadata *MDVals[] = {
1877      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1878      llvm::ConstantAsMetadata::get(
1879          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1880  // Append metadata to nvvm.annotations
1881  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1882}
1883
1884void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1885    const OMPExecutableDirective &D, StringRef ParentName,
1886    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1887    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1888  if (!IsOffloadEntry) // Nothing to do.
1889    return;
1890
1891  assert(!ParentName.empty() && "Invalid target region parent name!");
1892
1893  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1894  if (Mode)
1895    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1896                   CodeGen);
1897  else
1898    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1899                      CodeGen);
1900
1901  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1902}
1903
1904namespace {
1905LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1906/// Enum for accessing the reserved_2 field of the ident_t struct.
1907enum ModeFlagsTy : unsigned {
1908  /// Bit set to 1 when in SPMD mode.
1909  KMP_IDENT_SPMD_MODE = 0x01,
1910  /// Bit set to 1 when a simplified runtime is used.
1911  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1912  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1913};
1914
1915/// Special mode Undefined: the combination of Non-SPMD mode + SimpleRuntime.
1916static const ModeFlagsTy UndefinedMode =
1917    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1918} // anonymous namespace
1919
1920unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1921  switch (getExecutionMode()) {
1922  case EM_SPMD:
1923    if (requiresFullRuntime())
1924      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1925    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1926  case EM_NonSPMD:
1927    assert(requiresFullRuntime() && "Expected full runtime.");
1928    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1929  case EM_Unknown:
1930    return UndefinedMode;
1931  }
1932  llvm_unreachable("Unknown flags are requested.");
1933}
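// Editor's note: the reserved_2 encodings produced by the switch above,
// spelled out:
//
//   SPMD + full runtime     -> KMP_IDENT_SPMD_MODE                             (0x01)
//   SPMD + simple runtime   -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE  (0x03)
//   Non-SPMD (full runtime) -> neither bit set                                 (0x00)
//   Unknown                 -> UndefinedMode                                   (0x02)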
1934
1935CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1936    : CGOpenMPRuntime(CGM, "_""$") {
1937  if (!CGM.getLangOpts().OpenMPIsDevice)
1938    llvm_unreachable("OpenMP NVPTX can only handle device code.");
1939}
1940
1941void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1942                                              OpenMPProcBindClauseKind ProcBind,
1943                                              SourceLocation Loc) {
1944  // Do nothing in case of SPMD mode and L0 parallel.
1945  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1946    return;
1947
1948  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1949}
1950
1951void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1952                                                llvm::Value *NumThreads,
1953                                                SourceLocation Loc) {
1954  // Do nothing in case of SPMD mode and L0 parallel.
1955  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1956    return;
1957
1958  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1959}
1960
1961void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1962                                              const Expr *NumTeams,
1963                                              const Expr *ThreadLimit,
1964                                              SourceLocation Loc) {}
1965
1966llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1967    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1968    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1969  // Emit target region as a standalone region.
1970  class NVPTXPrePostActionTy : public PrePostActionTy {
1971    bool &IsInParallelRegion;
1972    bool PrevIsInParallelRegion;
1973
1974  public:
1975    NVPTXPrePostActionTy(bool &IsInParallelRegion)
1976        : IsInParallelRegion(IsInParallelRegion) {}
1977    void Enter(CodeGenFunction &CGF) override {
1978      PrevIsInParallelRegion = IsInParallelRegion;
1979      IsInParallelRegion = true;
1980    }
1981    void Exit(CodeGenFunction &CGF) override {
1982      IsInParallelRegion = PrevIsInParallelRegion;
1983    }
1984  } Action(IsInParallelRegion);
1985  CodeGen.setAction(Action);
1986  bool PrevIsInTTDRegion = IsInTTDRegion;
1987  IsInTTDRegion = false;
1988  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1989  IsInTargetMasterThreadRegion = false;
1990  auto *OutlinedFun =
1991      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1992          D, ThreadIDVar, InnermostKind, CodeGen));
1993  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1994  IsInTTDRegion = PrevIsInTTDRegion;
1995  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1996      !IsInParallelRegion) {
1997    llvm::Function *WrapperFun =
1998        createParallelDataSharingWrapper(OutlinedFun, D);
1999    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
2000  }
2001
2002  return OutlinedFun;
2003}
2004
2005/// Get list of lastprivate variables from the teams distribute ... or
2006/// teams {distribute ...} directives.
2007static void
2008getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2009                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2010  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2011         "expected teams directive.");
2012  const OMPExecutableDirective *Dir = &D;
2013  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
2014    if (const Stmt *S = getSingleCompoundChild(
2015            Ctx,
2016            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
2017                /*IgnoreCaptured=*/true))) {
2018      Dir = dyn_cast<OMPExecutableDirective>(S);
2019      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2020        Dir = nullptr;
2021    }
2022  }
2023  if (!Dir)
2024    return;
2025  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2026    for (const Expr *E : C->getVarRefs())
2027      Vars.push_back(getPrivateItem(E));
2028  }
2029}
2030
2031/// Get list of reduction variables from the teams ... directives.
2032static void
2033getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2034                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2035  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2036         "expected teams directive.");
2037  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2038    for (const Expr *E : C->privates())
2039      Vars.push_back(getPrivateItem(E));
2040  }
2041}
2042
2043llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2044    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2045    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2046  SourceLocation Loc = D.getBeginLoc();
2047
2048  const RecordDecl *GlobalizedRD = nullptr;
2049  llvm::SmallVector<const ValueDecl *, 4LastPrivatesReductions;
2050  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2051  // Globalize team reductions variable unconditionally in all modes.
2052  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2053    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2054  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2055    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2056    if (!LastPrivatesReductions.empty()) {
2057      GlobalizedRD = ::buildRecordForGlobalizedVars(
2058          CGM.getContext(), llvm::None, LastPrivatesReductions,
2059          MappedDeclsFields, WarpSize);
2060    }
2061  } else if (!LastPrivatesReductions.empty()) {
2062    assert(!TeamAndReductions.first &&
2063           "Previous team declaration is not expected.");
2064    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2065    std::swap(TeamAndReductions.second, LastPrivatesReductions);
2066  }
2067
2068  // Emit target region as a standalone region.
2069  class NVPTXPrePostActionTy : public PrePostActionTy {
2070    SourceLocation &Loc;
2071    const RecordDecl *GlobalizedRD;
2072    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2073        &MappedDeclsFields;
2074
2075  public:
2076    NVPTXPrePostActionTy(
2077        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2078        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2079            &MappedDeclsFields)
2080        : Loc(Loc), GlobalizedRD(GlobalizedRD),
2081          MappedDeclsFields(MappedDeclsFields) {}
2082    void Enter(CodeGenFunction &CGF) override {
2083      auto &Rt =
2084          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2085      if (GlobalizedRD) {
2086        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2087        I->getSecond().GlobalRecord = GlobalizedRD;
2088        I->getSecond().MappedParams =
2089            llvm::make_unique<CodeGenFunction::OMPMapVars>();
2090        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2091        for (const auto &Pair : MappedDeclsFields) {
2092          assert(Pair.getFirst()->isCanonicalDecl() &&
2093                 "Expected canonical declaration");
2094          Data.insert(std::make_pair(Pair.getFirst(),
2095                                     MappedVarData(Pair.getSecond(),
2096                                                   /*IsOnePerTeam=*/true)));
2097        }
2098      }
2099      Rt.emitGenericVarsProlog(CGF, Loc);
2100    }
2101    void Exit(CodeGenFunction &CGF) override {
2102      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2103          .emitGenericVarsEpilog(CGF);
2104    }
2105  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2106  CodeGen.setAction(Action);
2107  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2108      D, ThreadIDVar, InnermostKind, CodeGen);
2109  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2110  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2111  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2112
2113  return OutlinedFun;
2114}
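// Editor's note: the attribute rewrite above strips NoInline/OptimizeNone and
// adds AlwaysInline, so the teams outlined function is folded into the kernel
// entry; presumably this keeps the globalized-variable prolog/epilog visible
// to later device passes.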
2115
2116void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2117                                                 SourceLocation Loc,
2118                                                 bool WithSPMDCheck) {
2119  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2120      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2121    return;
2122
2123  CGBuilderTy &Bld = CGF.Builder;
2124
2125  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2126  if (I == FunctionGlobalizedDecls.end())
2127    return;
2128  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2129    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2130    QualType SecGlobalRecTy;
2131
2132    // Recover pointer to this function's global record. The runtime will
2133    // handle the specifics of the allocation of the memory.
2134    // Use actual memory size of the record including the padding
2135    // for alignment purposes.
2136    unsigned Alignment =
2137        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2138    unsigned GlobalRecordSize =
2139        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2140    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2141
2142    llvm::PointerType *GlobalRecPtrTy =
2143        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2144    llvm::Value *GlobalRecCastAddr;
2145    llvm::Value *IsTTD = nullptr;
2146    if (!IsInTTDRegion &&
2147        (WithSPMDCheck ||
2148         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2149      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2150      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2151      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2152      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2153        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2154        llvm::Value *ThreadID = getThreadID(CGF, Loc);
2155        llvm::Value *PL = CGF.EmitRuntimeCall(
2156            createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2157            {RTLoc, ThreadID});
2158        IsTTD = Bld.CreateIsNull(PL);
2159      }
2160      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2161          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2162      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2163      // There is no need to emit line number for unconditional branch.
2164      (void)ApplyDebugLocation::CreateEmpty(CGF);
2165      CGF.EmitBlock(SPMDBB);
2166      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2167                               CharUnits::fromQuantity(Alignment));
2168      CGF.EmitBranch(ExitBB);
2169      // There is no need to emit line number for unconditional branch.
2170      (void)ApplyDebugLocation::CreateEmpty(CGF);
2171      CGF.EmitBlock(NonSPMDBB);
2172      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2173      if (const RecordDecl *SecGlobalizedVarsRecord =
2174              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2175        SecGlobalRecTy =
2176            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2177
2178        // Recover pointer to this function's global record. The runtime will
2179        // handle the specifics of the allocation of the memory.
2180        // Use actual memory size of the record including the padding
2181        // for alignment purposes.
2182        unsigned Alignment =
2183            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2184        unsigned GlobalRecordSize =
2185            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2186        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2187        Size = Bld.CreateSelect(
2188            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2189      }
2190      // TODO: allow the usage of shared memory to be controlled by
2191      // the user, for now, default to global.
2192      llvm::Value *GlobalRecordSizeArg[] = {
2193          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2194      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2195          createNVPTXRuntimeFunction(
2196              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2197          GlobalRecordSizeArg);
2198      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2199          GlobalRecValue, GlobalRecPtrTy);
2200      CGF.EmitBlock(ExitBB);
2201      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2202                                /*NumReservedValues=*/2, "_select_stack");
2203      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2204      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2205      GlobalRecCastAddr = Phi;
2206      I->getSecond().GlobalRecordAddr = Phi;
2207      I->getSecond().IsInSPMDModeFlag = IsSPMD;
2208    } else if (IsInTTDRegion) {
2209      assert(GlobalizedRecords.back().Records.size() < 2 &&
2210             "Expected less than 2 globalized records: one for target and one "
2211             "for teams.");
2212      unsigned Offset = 0;
2213      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2214        QualType RDTy = CGM.getContext().getRecordType(RD);
2215        unsigned Alignment =
2216            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2217        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2218        Offset =
2219            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2220      }
2221      unsigned Alignment =
2222          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2223      Offset = llvm::alignTo(Offset, Alignment);
2224      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2225      ++GlobalizedRecords.back().RegionCounter;
2226      if (GlobalizedRecords.back().Records.size() == 1) {
2227        assert(KernelStaticGlobalized &&
2228               "Kernel static pointer must be initialized already.");
2229        auto *UseSharedMemory = new llvm::GlobalVariable(
2230            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2231            llvm::GlobalValue::InternalLinkage, nullptr,
2232            "_openmp_static_kernel$is_shared");
2233        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2234        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2235            /*DestWidth=*/16, /*Signed=*/0);
2236        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2237            Address(UseSharedMemory,
2238                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
2239            /*Volatile=*/false, Int16Ty, Loc);
2240        auto *StaticGlobalized = new llvm::GlobalVariable(
2241            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2242            llvm::GlobalValue::CommonLinkage, nullptr);
2243        auto *RecSize = new llvm::GlobalVariable(
2244            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2245            llvm::GlobalValue::InternalLinkage, nullptr,
2246            "_openmp_static_kernel$size");
2247        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2248        llvm::Value *Ld = CGF.EmitLoadOfScalar(
2249            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2250            CGM.getContext().getSizeType(), Loc);
2251        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2252            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2253        llvm::Value *GlobalRecordSizeArg[] = {
2254            llvm::ConstantInt::get(
2255                CGM.Int16Ty,
2256                getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2257            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2258        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2259                                OMPRTL_NVPTX__kmpc_get_team_static_memory),
2260                            GlobalRecordSizeArg);
2261        GlobalizedRecords.back().Buffer = StaticGlobalized;
2262        GlobalizedRecords.back().RecSize = RecSize;
2263        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2264        GlobalizedRecords.back().Loc = Loc;
2265      }
2266      assert(KernelStaticGlobalized && "Global address must be set already.");
2267      Address FrameAddr = CGF.EmitLoadOfPointer(
2268          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2269          CGM.getContext()
2270              .getPointerType(CGM.getContext().VoidPtrTy)
2271              .castAs<PointerType>());
2272      llvm::Value *GlobalRecValue =
2273          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2274      I->getSecond().GlobalRecordAddr = GlobalRecValue;
2275      I->getSecond().IsInSPMDModeFlag = nullptr;
2276      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2277          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2278    } else {
2279      // TODO: allow the usage of shared memory to be controlled by
2280      // the user, for now, default to global.
2281      llvm::Value *GlobalRecordSizeArg[] = {
2282          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2283          CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2284      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2285          createNVPTXRuntimeFunction(
2286              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2287          GlobalRecordSizeArg);
2288      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2289          GlobalRecValue, GlobalRecPtrTy);
2290      I->getSecond().GlobalRecordAddr = GlobalRecValue;
2291      I->getSecond().IsInSPMDModeFlag = nullptr;
2292    }
2293    LValue Base =
2294        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2295
2296    // Emit the "global alloca" which is a GEP from the global declaration
2297    // record using the pointer returned by the runtime.
2298    LValue SecBase;
2299    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2300    if (IsTTD) {
2301      SecIt = I->getSecond().SecondaryLocalVarData->begin();
2302      llvm::PointerType *SecGlobalRecPtrTy =
2303          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2304      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2305          Bld.CreatePointerBitCastOrAddrSpaceCast(
2306              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2307          SecGlobalRecTy);
2308    }
2309    for (auto &Rec : I->getSecond().LocalVarData) {
2310      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2311      llvm::Value *ParValue;
2312      if (EscapedParam) {
2313        const auto *VD = cast<VarDecl>(Rec.first);
2314        LValue ParLVal =
2315            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2316        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2317      }
2318      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2319      // Emit VarAddr basing on lane-id if required.
2320      QualType VarTy;
2321      if (Rec.second.IsOnePerTeam) {
2322        VarTy = Rec.second.FD->getType();
2323      } else {
2324        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2325            VarAddr.getAddress().getPointer(),
2326            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2327        VarTy =
2328            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2329        VarAddr = CGF.MakeAddrLValue(
2330            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2331            AlignmentSource::Decl);
2332      }
2333      Rec.second.PrivateAddr = VarAddr.getAddress();
2334      if (!IsInTTDRegion &&
2335          (WithSPMDCheck ||
2336           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2337        assert(I->getSecond().IsInSPMDModeFlag &&
2338               "Expected unknown execution mode or required SPMD check.");
2339        if (IsTTD) {
2340          assert(SecIt->second.IsOnePerTeam &&
2341                 "Secondary glob data must be one per team.");
2342          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2343          VarAddr.setAddress(
2344              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2345                                       VarAddr.getPointer()),
2346                      VarAddr.getAlignment()));
2347          Rec.second.PrivateAddr = VarAddr.getAddress();
2348        }
2349        Address GlobalPtr = Rec.second.PrivateAddr;
2350        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2351        Rec.second.PrivateAddr = Address(
2352            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2353                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
2354            LocalAddr.getAlignment());
2355      }
2356      if (EscapedParam) {
2357        const auto *VD = cast<VarDecl>(Rec.first);
2358        CGF.EmitStoreOfScalar(ParValue, VarAddr);
2359        I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2360      }
2361      if (IsTTD)
2362        ++SecIt;
2363    }
2364  }
2365  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2366    // Recover pointer to this function's global record. The runtime will
2367    // handle the specifics of the allocation of the memory.
2368    // Use actual memory size of the record including the padding
2369    // for alignment purposes.
2370    CGBuilderTy &Bld = CGF.Builder;
2371    llvm::Value *Size = CGF.getTypeSize(VD->getType());
2372    CharUnits Align = CGM.getContext().getDeclAlign(VD);
2373    Size = Bld.CreateNUWAdd(
2374        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2375    llvm::Value *AlignVal =
2376        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2377    Size = Bld.CreateUDiv(Size, AlignVal);
2378    Size = Bld.CreateNUWMul(Size, AlignVal);
2379    // TODO: allow the usage of shared memory to be controlled by
2380    // the user, for now, default to global.
2381    llvm::Value *GlobalRecordSizeArg[] = {
2382        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2383    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2384        createNVPTXRuntimeFunction(
2385            OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2386        GlobalRecordSizeArg);
2387    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2388        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2389    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2390                                     CGM.getContext().getDeclAlign(VD),
2391                                     AlignmentSource::Decl);
2392    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2393                                            Base.getAddress());
2394    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2395  }
2396  I->getSecond().MappedParams->apply(CGF);
2397}
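// The escaped-VLA loop above rounds each allocation size up to the declared
// alignment before calling __kmpc_data_sharing_coalesced_push_stack. A
// minimal sketch of that arithmetic (editorial illustration; the helper name
// is hypothetical and not part of this file):
//
//   uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
//     return (Size + Align - 1) / Align * Align; // e.g. (10 + 7) / 8 * 8 == 16
//   }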
2398
2399void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2400                                                 bool WithSPMDCheck) {
2401  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2402      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2403    return;
2404
2405  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2406  if (I != FunctionGlobalizedDecls.end()) {
2407    I->getSecond().MappedParams->restore(CGF);
2408    if (!CGF.HaveInsertPoint())
2409      return;
2410    for (llvm::Value *Addr :
2411         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2412      CGF.EmitRuntimeCall(
2413          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2414          Addr);
2415    }
2416    if (I->getSecond().GlobalRecordAddr) {
2417      if (!IsInTTDRegion &&
2418          (WithSPMDCheck ||
2419           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2420        CGBuilderTy &Bld = CGF.Builder;
2421        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2422        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2423        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2424        // There is no need to emit line number for unconditional branch.
2425        (void)ApplyDebugLocation::CreateEmpty(CGF);
2426        CGF.EmitBlock(NonSPMDBB);
2427        CGF.EmitRuntimeCall(
2428            createNVPTXRuntimeFunction(
2429                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2430            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2431        CGF.EmitBlock(ExitBB);
2432      } else if (IsInTTDRegion) {
2433        assert(GlobalizedRecords.back().RegionCounter > 0 &&
2434               "region counter must be > 0.");
2435        --GlobalizedRecords.back().RegionCounter;
2436        // Emit the restore function only in the target region.
2437        if (GlobalizedRecords.back().RegionCounter == 0) {
2438          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2439              /*DestWidth=*/16, /*Signed=*/0);
2440          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2441              Address(GlobalizedRecords.back().UseSharedMemory,
2442                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
2443              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2444          llvm::Value *Args[] = {
2445              llvm::ConstantInt::get(
2446                  CGM.Int16Ty,
2447                  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2448              IsInSharedMemory};
2449          CGF.EmitRuntimeCall(
2450              createNVPTXRuntimeFunction(
2451                  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2452              Args);
2453        }
2454      } else {
2455        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2456                                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2457                            I->getSecond().GlobalRecordAddr);
2458      }
2459    }
2460  }
2461}
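// For illustration, a user-level fragment that exercises this prolog/epilog
// pair in generic data-sharing mode (a sketch; the actual globalization
// decision is made by the escape analysis in this file):
//
//   #pragma omp target
//   {
//     int x = 0;            // address escapes into the parallel region
//     #pragma omp parallel shared(x)
//     { /* reads and writes x */ }
//   }
//
// Here `x` is placed in the globalized record pushed in emitGenericVarsProlog
// and released by the pop_stack call emitted above.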
2462
2463void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2464                                         const OMPExecutableDirective &D,
2465                                         SourceLocation Loc,
2466                                         llvm::Function *OutlinedFn,
2467                                         ArrayRef<llvm::Value *> CapturedVars) {
2468  if (!CGF.HaveInsertPoint())
2469    return;
2470
2471  Address ZeroAddr = CGF.CreateMemTemp(
2472      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2473      /*Name*/ ".zero.addr");
2474  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2475  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2476  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2477  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2478  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2479  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2480}
2481
2482void CGOpenMPRuntimeNVPTX::emitParallelCall(
2483    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2484    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2485  if (!CGF.HaveInsertPoint())
2486    return;
2487
2488  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2489    emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2490  else
2491    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2492}
2493
2494void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2495    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2496    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2497  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2498
2499  // Force inline this outlined function at its call site.
2500  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2501
2502  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2503                                           /*DestWidth=*/32, /*Signed=*/1),
2504                                       ".zero.addr");
2505  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2506  // ThreadId for serialized parallels is 0.
2507  Address ThreadIDAddr = ZeroAddr;
2508  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2509                       CodeGenFunction &CGF, PrePostActionTy &Action) {
2510    Action.Enter(CGF);
2511
2512    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2513    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2514    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2515    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2516    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2517  };
2518  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2519                                        PrePostActionTy &) {
2520
2521    RegionCodeGenTy RCG(CodeGen);
2522    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2523    llvm::Value *ThreadID = getThreadID(CGF, Loc);
2524    llvm::Value *Args[] = {RTLoc, ThreadID};
2525
2526    NVPTXActionTy Action(
2527        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2528        Args,
2529        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2530        Args);
2531    RCG.setAction(Action);
2532    RCG(CGF);
2533  };
2534
2535  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2536                                                  PrePostActionTy &Action) {
2537    CGBuilderTy &Bld = CGF.Builder;
2538    llvm::Function *WFn = WrapperFunctionsMap[Fn];
2539    assert(WFn && "Wrapper function does not exist!");
2540    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2541
2542    // Prepare for parallel region. Indicate the outlined function.
2543    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2544    CGF.EmitRuntimeCall(
2545        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2546        Args);
2547
2548    // Create a private scope that will globalize the arguments
2549    // passed from the outside of the target region.
2550    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2551
2552    // There's something to share.
2553    if (!CapturedVars.empty()) {
2554      // Prepare for parallel region. Indicate the outlined function.
2555      Address SharedArgs =
2556          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2557      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2558
2559      llvm::Value *DataSharingArgs[] = {
2560          SharedArgsPtr,
2561          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2562      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2563                              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2564                          DataSharingArgs);
2565
2566      // Store variable address in a list of references to pass to workers.
2567      unsigned Idx = 0;
2568      ASTContext &Ctx = CGF.getContext();
2569      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2570          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2571                          .castAs<PointerType>());
2572      for (llvm::Value *V : CapturedVars) {
2573        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2574        llvm::Value *PtrV;
2575        if (V->getType()->isIntegerTy())
2576          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2577        else
2578          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2579        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2580                              Ctx.getPointerType(Ctx.VoidPtrTy));
2581        ++Idx;
2582      }
2583    }
2584
2585    // Activate workers. This barrier is used by the master to signal
2586    // work for the workers.
2587    syncCTAThreads(CGF);
2588
2589    // OpenMP [2.5, Parallel Construct, p.49]
2590    // There is an implied barrier at the end of a parallel region. After the
2591    // end of a parallel region, only the master thread of the team resumes
2592    // execution of the enclosing task region.
2593    //
2594    // The master waits at this barrier until all workers are done.
2595    syncCTAThreads(CGF);
2596
2597    if (!CapturedVars.empty())
2598      CGF.EmitRuntimeCall(
2599          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2600
2601    // Remember for post-processing in worker loop.
2602    Work.emplace_back(WFn);
2603  };
2604
2605  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2606                             CodeGenFunction &CGF, PrePostActionTy &Action) {
2607    if (IsInParallelRegion) {
2608      SeqGen(CGF, Action);
2609    } else if (IsInTargetMasterThreadRegion) {
2610      L0ParallelGen(CGF, Action);
2611    } else {
2612      // Check for master and then parallelism:
2613      // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2614      //   Serialized execution.
2615      // } else {
2616      //   Worker call.
2617      // }
2618      CGBuilderTy &Bld = CGF.Builder;
2619      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2620      llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2621      llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2622      llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2623      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2624          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2625      Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2626      // There is no need to emit line number for unconditional branch.
2627      (void)ApplyDebugLocation::CreateEmpty(CGF);
2628      CGF.EmitBlock(ParallelCheckBB);
2629      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2630      llvm::Value *ThreadID = getThreadID(CGF, Loc);
2631      llvm::Value *PL = CGF.EmitRuntimeCall(
2632          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2633          {RTLoc, ThreadID});
2634      llvm::Value *Res = Bld.CreateIsNotNull(PL);
2635      Bld.CreateCondBr(Res, SeqBB, MasterBB);
2636      CGF.EmitBlock(SeqBB);
2637      SeqGen(CGF, Action);
2638      CGF.EmitBranch(ExitBB);
2639      // There is no need to emit line number for unconditional branch.
2640      (void)ApplyDebugLocation::CreateEmpty(CGF);
2641      CGF.EmitBlock(MasterBB);
2642      L0ParallelGen(CGF, Action);
2643      CGF.EmitBranch(ExitBB);
2644      // There is no need to emit line number for unconditional branch.
2645      (void)ApplyDebugLocation::CreateEmpty(CGF);
2646      // Emit the continuation block for code after the if.
2647      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2648    }
2649  };
2650
2651  if (IfCond) {
2652    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2653  } else {
2654    CodeGenFunction::RunCleanupsScope Scope(CGF);
2655    RegionCodeGenTy ThenRCG(LNParallelGen);
2656    ThenRCG(CGF);
2657  }
2658}
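// Taken together, the L0 worker path above emits roughly the following
// master-side sequence (pseudocode over the runtime entry points declared
// earlier in this file):
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   if (num_captured)
//     __kmpc_begin_sharing_variables(&arg_list, num_captured);
//   barrier(); // release the workers
//   barrier(); // wait until the workers are done
//   if (num_captured)
//     __kmpc_end_sharing_variables();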
2659
2660void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2661    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2662    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2663  // Just call the outlined function to execute the parallel region.
2664  // OutlinedFn(&GTid, &zero, CapturedStruct);
2665  //
2666  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2667
2668  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2669                                           /*DestWidth=*/32, /*Signed=*/1),
2670                                       ".zero.addr");
2671  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2672  // ThreadId for serialized parallels is 0.
2673  Address ThreadIDAddr = ZeroAddr;
2674  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2675                    &ThreadIDAddr](CodeGenFunction &CGF,
2676                                   PrePostActionTy &Action) {
2677    Action.Enter(CGF);
2678
2679    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2680    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2681    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2682    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2683    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2684  };
2685  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2686                                        PrePostActionTy &) {
2687
2688    RegionCodeGenTy RCG(CodeGen);
2689    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2690    llvm::Value *ThreadID = getThreadID(CGF, Loc);
2691    llvm::Value *Args[] = {RTLoc, ThreadID};
2692
2693    NVPTXActionTy Action(
2694        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2695        Args,
2696        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2697        Args);
2698    RCG.setAction(Action);
2699    RCG(CGF);
2700  };
2701
2702  if (IsInTargetMasterThreadRegion) {
2703    // In the worker we need to use the real thread id.
2704    ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2705    RegionCodeGenTy RCG(CodeGen);
2706    RCG(CGF);
2707  } else {
2708    // If we are not in the target region, it is definitely L2 parallelism or
2709    // more, because for SPMD mode we always have an L1 parallel level, so we don't
2710    // need to check for orphaned directives.
2711    RegionCodeGenTy RCG(SeqGen);
2712    RCG(CGF);
2713  }
2714}
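// In SPMD mode, the SeqGen path above serializes nested (L2+) parallel
// regions. The emitted sequence is roughly (pseudocode):
//
//   __kmpc_serialized_parallel(loc, gtid);
//   outlined_fn(&zero, &zero, captured_vars...); // thread id 0 when serialized
//   __kmpc_end_serialized_parallel(loc, gtid);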
2715
2716void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2717  // Always emit simple barriers!
2718  if (!CGF.HaveInsertPoint())
2719    return;
2720  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2721  // This function does not use parameters, so we can emit just default values.
2722  llvm::Value *Args[] = {
2723      llvm::ConstantPointerNull::get(
2724          cast<llvm::PointerType>(getIdentTyPointerTy())),
2725      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2726  CGF.EmitRuntimeCall(
2727      createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2728}
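// With the default arguments above, the emitted call is simply (LLVM IR
// sketch, assuming the usual ident_t lowering of this era):
//
//   call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)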
2729
2730void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2731                                           SourceLocation Loc,
2732                                           OpenMPDirectiveKind Kind, bool,
2733                                           bool) {
2734  // Always emit simple barriers!
2735  if (!CGF.HaveInsertPoint())
2736    return;
2737  // Build call __kmpc_barrier(loc, thread_id);
2738  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2739  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2740                         getThreadID(CGF, Loc)};
2741  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2742}
2743
2744void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2745    CodeGenFunction &CGF, StringRef CriticalName,
2746    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2747    const Expr *Hint) {
2748  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2749  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2750  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2751  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2752  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2753
2754  // Fetch team-local id of the thread.
2755  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2756
2757  // Get the width of the team.
2758  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2759
2760  // Initialize the counter variable for the loop.
2761  QualType Int32Ty =
2762      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2763  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2764  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2765  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2766                        /*isInit=*/true);
2767
2768  // Block checks if loop counter exceeds upper bound.
2769  CGF.EmitBlock(LoopBB);
2770  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2771  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2772  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2773
2774  // Block tests which single thread should execute region, and which threads
2775  // should go straight to synchronisation point.
2776  CGF.EmitBlock(TestBB);
2777  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2778  llvm::Value *CmpThreadToCounter =
2779      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2780  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2781
2782  // Block emits the body of the critical region.
2783  CGF.EmitBlock(BodyBB);
2784
2785  // Output the critical statement.
2786  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2787                                      Hint);
2788
2789  // After the body surrounded by the critical region, the single executing
2790  // thread will jump to the synchronisation point.
2791  // Block waits for all threads in current team to finish then increments the
2792  // counter variable and returns to the loop.
2793  CGF.EmitBlock(SyncBB);
2794  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2795                  /*ForceSimpleCall=*/true);
2796
2797  llvm::Value *IncCounterVal =
2798      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2799  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2800  CGF.EmitBranch(LoopBB);
2801
2802  // Block that is reached when all threads in the team complete the region.
2803  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2804}
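// The block structure above amounts to the following per-thread loop
// (pseudocode; `tid` is the team-local thread id and `width` the value
// returned by getNVPTXNumThreads):
//
//   for (counter = 0; counter < width; ++counter) {
//     if (tid == counter)
//       /* execute the critical body (still guarded by the generic runtime) */;
//     barrier(); // all threads in the team
//   }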
2805
2806/// Cast value to the specified type.
2807static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2808                                    QualType ValTy, QualType CastTy,
2809                                    SourceLocation Loc) {
2810  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2811         "Cast type must be sized.");
2812  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2813         "Val type must be sized.");
2814  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2815  if (ValTy == CastTy)
2816    return Val;
2817  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2818      CGF.getContext().getTypeSizeInChars(CastTy))
2819    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2820  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2821    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2822                                     CastTy->hasSignedIntegerRepresentation());
2823  Address CastItem = CGF.CreateMemTemp(CastTy);
2824  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2825      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2826  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2827  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2828}
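// The final fallback above is plain type punning through memory: the value is
// stored with its original type and reloaded with the target type. A
// host-side analogue for the same-size case (hypothetical helper, shown only
// for illustration):
//
//   int32_t bitsOfFloat(float F) {
//     int32_t I;
//     std::memcpy(&I, &F, sizeof(I)); // store as float, reload as int32
//     return I;
//   }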
2829
2830/// This function creates calls to one of two shuffle functions to copy
2831/// variables between lanes in a warp.
2832static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2833                                                 llvm::Value *Elem,
2834                                                 QualType ElemType,
2835                                                 llvm::Value *Offset,
2836                                                 SourceLocation Loc) {
2837  CodeGenModule &CGM = CGF.CGM;
2838  CGBuilderTy &Bld = CGF.Builder;
2839  CGOpenMPRuntimeNVPTX &RT =
2840      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2841
2842  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2843  assert(Size.getQuantity() <= 8 &&
2844         "Unsupported bitwidth in shuffle instruction.");
2845
2846  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2847                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
2848                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;
2849
2850  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2851  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2852      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2853  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2854  llvm::Value *WarpSize =
2855      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2856
2857  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2858      RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2859
2860  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2861}
2862
2863static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2864                            Address DestAddr, QualType ElemType,
2865                            llvm::Value *Offset, SourceLocation Loc) {
2866  CGBuilderTy &Bld = CGF.Builder;
2867
2868  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2869  // Create the loop over the big sized data.
2870  // ptr = (void*)Elem;
2871  // ptrEnd = (void*) Elem + 1;
2872  // Step = 8;
2873  // while (ptr + Step < ptrEnd)
2874  //   shuffle((int64_t)*ptr);
2875  // Step = 4;
2876  // while (ptr + Step < ptrEnd)
2877  //   shuffle((int32_t)*ptr);
2878  // ...
2879  Address ElemPtr = DestAddr;
2880  Address Ptr = SrcAddr;
2881  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2882      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2883  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2884    if (Size < CharUnits::fromQuantity(IntSize))
2885      continue;
2886    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2887        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2888        /*Signed=*/1);
2889    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2890    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2891    ElemPtr =
2892        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2893    if (Size.getQuantity() / IntSize > 1) {
2894      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2895      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2896      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2897      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2898      CGF.EmitBlock(PreCondBB);
2899      llvm::PHINode *PhiSrc =
2900          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2901      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2902      llvm::PHINode *PhiDest =
2903          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2904      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2905      Ptr = Address(PhiSrc, Ptr.getAlignment());
2906      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2907      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2908          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2909                                   Ptr.getPointer(), CGF.VoidPtrTy));
2910      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2911                       ThenBB, ExitBB);
2912      CGF.EmitBlock(ThenBB);
2913      llvm::Value *Res = createRuntimeShuffleFunction(
2914          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2915          IntType, Offset, Loc);
2916      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2917      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2918      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2919      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2920      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2921      CGF.EmitBranch(PreCondBB);
2922      CGF.EmitBlock(ExitBB);
2923    } else {
2924      llvm::Value *Res = createRuntimeShuffleFunction(
2925          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2926          IntType, Offset, Loc);
2927      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2928      Ptr = Bld.CreateConstGEP(Ptr, 1);
2929      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2930    }
2931    Size = Size % IntSize;
2932  }
2933}
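// Worked instances of the chunking above: a 12-byte element moves as one
// 8-byte shuffle (12 / 8 == 1, so no loop) followed by one 4-byte shuffle
// (12 % 8 == 4); a 24-byte element takes the loop path at IntSize == 8
// (24 / 8 == 3 iterations) and then stops, since 24 % 8 == 0.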
2934
2935namespace {
2936enum CopyAction : unsigned {
2937  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2938  // the warp using shuffle instructions.
2939  RemoteLaneToThread,
2940  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2941  ThreadCopy,
2942  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2943  ThreadToScratchpad,
2944  // ScratchpadToThread: Copy from a scratchpad array in global memory
2945  // containing team-reduced data to a thread's stack.
2946  ScratchpadToThread,
2947};
2948} // namespace
2949
2950struct CopyOptionsTy {
2951  llvm::Value *RemoteLaneOffset;
2952  llvm::Value *ScratchpadIndex;
2953  llvm::Value *ScratchpadWidth;
2954};
2955
2956/// Emit instructions to copy a Reduce list, which contains partially
2957/// aggregated values, in the specified direction.
2958static void emitReductionListCopy(
2959    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2960    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2961    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2962
2963  CodeGenModule &CGM = CGF.CGM;
2964  ASTContext &C = CGM.getContext();
2965  CGBuilderTy &Bld = CGF.Builder;
2966
2967  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2968  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2969  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2970
2971  // Iterate, element by element, through the source Reduce list and
2972  // make a copy.
2973  unsigned Idx = 0;
2974  unsigned Size = Privates.size();
2975  for (const Expr *Private : Privates) {
2976    Address SrcElementAddr = Address::invalid();
2977    Address DestElementAddr = Address::invalid();
2978    Address DestElementPtrAddr = Address::invalid();
2979    // Should we shuffle in an element from a remote lane?
2980    bool ShuffleInElement = false;
2981    // Set to true to update the pointer in the dest Reduce list to a
2982    // newly created element.
2983    bool UpdateDestListPtr = false;
2984    // Increment the src or dest pointer to the scratchpad, for each
2985    // new element.
2986    bool IncrScratchpadSrc = false;
2987    bool IncrScratchpadDest = false;
2988
2989    switch (Action) {
2990    case RemoteLaneToThread: {
2991      // Step 1.1: Get the address for the src element in the Reduce list.
2992      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2993      SrcElementAddr = CGF.EmitLoadOfPointer(
2994          SrcElementPtrAddr,
2995          C.getPointerType(Private->getType())->castAs<PointerType>());
2996
2997      // Step 1.2: Create a temporary to store the element in the destination
2998      // Reduce list.
2999      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3000      DestElementAddr =
3001          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3002      ShuffleInElement = true;
3003      UpdateDestListPtr = true;
3004      break;
3005    }
3006    case ThreadCopy: {
3007      // Step 1.1: Get the address for the src element in the Reduce list.
3008      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3009      SrcElementAddr = CGF.EmitLoadOfPointer(
3010          SrcElementPtrAddr,
3011          C.getPointerType(Private->getType())->castAs<PointerType>());
3012
3013      // Step 1.2: Get the address for dest element.  The destination
3014      // element has already been created on the thread's stack.
3015      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3016      DestElementAddr = CGF.EmitLoadOfPointer(
3017          DestElementPtrAddr,
3018          C.getPointerType(Private->getType())->castAs<PointerType>());
3019      break;
3020    }
3021    case ThreadToScratchpad: {
3022      // Step 1.1: Get the address for the src element in the Reduce list.
3023      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3024      SrcElementAddr = CGF.EmitLoadOfPointer(
3025          SrcElementPtrAddr,
3026          C.getPointerType(Private->getType())->castAs<PointerType>());
3027
3028      // Step 1.2: Get the address for dest element:
3029      // address = base + index * ElementSizeInChars.
3030      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3031      llvm::Value *CurrentOffset =
3032          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3033      llvm::Value *ScratchPadElemAbsolutePtrVal =
3034          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3035      ScratchPadElemAbsolutePtrVal =
3036          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3037      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3038                                C.getTypeAlignInChars(Private->getType()));
3039      IncrScratchpadDest = true;
3040      break;
3041    }
3042    case ScratchpadToThread: {
3043      // Step 1.1: Get the address for the src element in the scratchpad.
3044      // address = base + index * ElementSizeInChars.
3045      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3046      llvm::Value *CurrentOffset =
3047          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3048      llvm::Value *ScratchPadElemAbsolutePtrVal =
3049          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3050      ScratchPadElemAbsolutePtrVal =
3051          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3052      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3053                               C.getTypeAlignInChars(Private->getType()));
3054      IncrScratchpadSrc = true;
3055
3056      // Step 1.2: Create a temporary to store the element in the destination
3057      // Reduce list.
3058      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3059      DestElementAddr =
3060          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3061      UpdateDestListPtr = true;
3062      break;
3063    }
3064    }
3065
3066    // Regardless of src and dest of copy, we emit the load of src
3067    // element as this is required in all directions
3068    SrcElementAddr = Bld.CreateElementBitCast(
3069        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3070    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3071                                               SrcElementAddr.getElementType());
3072
3073    // Now that all active lanes have read the element in the
3074    // Reduce list, shuffle over the value from the remote lane.
3075    if (ShuffleInElement) {
3076      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3077                      RemoteLaneOffset, Private->getExprLoc());
3078    } else {
3079      switch (CGF.getEvaluationKind(Private->getType())) {
3080      case TEK_Scalar: {
3081        llvm::Value *Elem =
3082            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3083                                 Private->getType(), Private->getExprLoc());
3084        // Store the source element value to the dest element address.
3085        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3086                              Private->getType());
3087        break;
3088      }
3089      case TEK_Complex: {
3090        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3091            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3092            Private->getExprLoc());
3093        CGF.EmitStoreOfComplex(
3094            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3095            /*isInit=*/false);
3096        break;
3097      }
3098      case TEK_Aggregate:
3099        CGF.EmitAggregateCopy(
3100            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3101            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3102            Private->getType(), AggValueSlot::DoesNotOverlap);
3103        break;
3104      }
3105    }
3106
3107    // Step 3.1: Modify reference in dest Reduce list as needed.
3108    // Modifying the reference in Reduce list to point to the newly
3109    // created element.  The element is live in the current function
3110    // scope and that of functions it invokes (i.e., reduce_function).
3111    // RemoteReduceData[i] = (void*)&RemoteElem
3112    if (UpdateDestListPtr) {
3113      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3114                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
3115                            DestElementPtrAddr, /*Volatile=*/false,
3116                            C.VoidPtrTy);
3117    }
3118
3119    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3120    // address of the next element in scratchpad memory, unless we're currently
3121    // processing the last one.  Memory alignment is also taken care of here.
3122    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3123      llvm::Value *ScratchpadBasePtr =
3124          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3125      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3126      ScratchpadBasePtr = Bld.CreateNUWAdd(
3127          ScratchpadBasePtr,
3128          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3129
3130      // Take care of global memory alignment for performance
3131      ScratchpadBasePtr = Bld.CreateNUWSub(
3132          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3133      ScratchpadBasePtr = Bld.CreateUDiv(
3134          ScratchpadBasePtr,
3135          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3136      ScratchpadBasePtr = Bld.CreateNUWAdd(
3137          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3138      ScratchpadBasePtr = Bld.CreateNUWMul(
3139          ScratchpadBasePtr,
3140          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3141
3142      if (IncrScratchpadDest)
3143        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3144      else /* IncrScratchpadSrc = true */
3145        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3146    }
3147
3148    ++Idx;
3149  }
3150}
3151
3152/// This function emits a helper that gathers Reduce lists from the first
3153/// lane of every active warp to lanes in the first warp.
3154///
3155/// void inter_warp_copy_func(void* reduce_data, num_warps)
3156///   shared smem[warp_size];
3157///   For all data entries D in reduce_data:
3158///     sync
3159///     If (I am the first lane in each warp)
3160///       Copy my local D to smem[warp_id]
3161///     sync
3162///     if (I am the first warp)
3163///       Copy smem[thread_id] to my local D
3164static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3165                                              ArrayRef<const Expr *> Privates,
3166                                              QualType ReductionArrayTy,
3167                                              SourceLocation Loc) {
3168  ASTContext &C = CGM.getContext();
3169  llvm::Module &M = CGM.getModule();
3170
3171  // ReduceList: thread local Reduce list.
3172  // At the stage of the computation when this function is called, partially
3173  // aggregated values reside in the first lane of every active warp.
3174  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3175                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3176  // NumWarps: number of warps active in the parallel region.  This could
3177  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3178  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3179                                C.getIntTypeForBitwidth(32/* Signed */ true),
3180                                ImplicitParamDecl::Other);
3181  FunctionArgList Args;
3182  Args.push_back(&ReduceListArg);
3183  Args.push_back(&NumWarpsArg);
3184
3185  const CGFunctionInfo &CGFI =
3186      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3187  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3188                                    llvm::GlobalValue::InternalLinkage,
3189                                    "_omp_reduction_inter_warp_copy_func", &M);
3190  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3191  Fn->setDoesNotRecurse();
3192  CodeGenFunction CGF(CGM);
3193  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3194
3195  CGBuilderTy &Bld = CGF.Builder;
3196
3197  // This array is used as a medium to transfer, one reduce element at a time,
3198  // the data from the first lane of every warp to lanes in the first warp
3199  // in order to perform the final step of a reduction in a parallel region
3200  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
3201  // for reduced latency, as well as to have a distinct copy for concurrently
3202  // executing target regions.  The array is declared with common linkage so
3203  // as to be shared across compilation units.
3204  StringRef TransferMediumName =
3205      "__openmp_nvptx_data_transfer_temporary_storage";
3206  llvm::GlobalVariable *TransferMedium =
3207      M.getGlobalVariable(TransferMediumName);
3208  if (!TransferMedium) {
3209    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3210    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3211    TransferMedium = new llvm::GlobalVariable(
3212        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3213        llvm::Constant::getNullValue(Ty), TransferMediumName,
3214        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3215        SharedAddressSpace);
3216    CGM.addCompilerUsedGlobal(TransferMedium);
3217  }
3218
3219  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3220  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3221  // nvptx_lane_id = nvptx_id % warpsize
3222  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3223  // nvptx_warp_id = nvptx_id / warpsize
3224  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3225
3226  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3227  Address LocalReduceList(
3228      Bld.CreatePointerBitCastOrAddrSpaceCast(
3229          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3230                               C.VoidPtrTy, Loc),
3231          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3232      CGF.getPointerAlign());
3233
3234  unsigned Idx = 0;
3235  for (const Expr *Private : Privates) {
3236    //
3237    // Warp master copies reduce element to transfer medium in __shared__
3238    // memory.
3239    //
3240    unsigned RealTySize =
3241        C.getTypeSizeInChars(Private->getType())
3242            .alignTo(C.getTypeAlignInChars(Private->getType()))
3243            .getQuantity();
3244    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3245      unsigned NumIters = RealTySize / TySize;
3246      if (NumIters == 0)
3247        continue;
3248      QualType CType = C.getIntTypeForBitwidth(
3249          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3250      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3251      CharUnits Align = CharUnits::fromQuantity(TySize);
3252      llvm::Value *Cnt = nullptr;
3253      Address CntAddr = Address::invalid();
3254      llvm::BasicBlock *PrecondBB = nullptr;
3255      llvm::BasicBlock *ExitBB = nullptr;
3256      if (NumIters > 1) {
3257        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3258        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3259                              /*Volatile=*/false, C.IntTy);
3260        PrecondBB = CGF.createBasicBlock("precond");
3261        ExitBB = CGF.createBasicBlock("exit");
3262        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3263        // There is no need to emit line number for unconditional branch.
3264        (void)ApplyDebugLocation::CreateEmpty(CGF);
3265        CGF.EmitBlock(PrecondBB);
3266        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3267        llvm::Value *Cmp =
3268            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3269        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3270        CGF.EmitBlock(BodyBB);
3271      }
3272      // kmpc_barrier.
3273      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3274                                             /*EmitChecks=*/false,
3275                                             /*ForceSimpleCall=*/true);
3276      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3277      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3278      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3279
3280      // if (lane_id == 0)
3281      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3282      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3283      CGF.EmitBlock(ThenBB);
3284
3285      // Reduce element = LocalReduceList[i]
3286      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3287      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3288          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3289      // elemptr = ((CopyType*)(elemptrptr)) + I
3290      Address ElemPtr = Address(ElemPtrPtr, Align);
3291      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3292      if (NumIters > 1) {
3293        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3294                          ElemPtr.getAlignment());
3295      }
3296
3297      // Get pointer to location in transfer medium.
3298      // MediumPtr = &medium[warp_id]
3299      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3300          TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3301      Address MediumPtr(MediumPtrVal, Align);
3302      // Casting to actual data type.
3303      // MediumPtr = (CopyType*)MediumPtrAddr;
3304      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3305
3306      // elem = *elemptr
3307      //*MediumPtr = elem
3308      llvm::Value *Elem =
3309          CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3310      // Store the source element value to the dest element address.
3311      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3312
3313      Bld.CreateBr(MergeBB);
3314
3315      CGF.EmitBlock(ElseBB);
3316      Bld.CreateBr(MergeBB);
3317
3318      CGF.EmitBlock(MergeBB);
3319
3320      // kmpc_barrier.
3321      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3322                                             /*EmitChecks=*/false,
3323                                             /*ForceSimpleCall=*/true);
3324
3325      //
3326      // Warp 0 copies reduce element from transfer medium.
3327      //
3328      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3329      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3330      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3331
3332      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3333      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3334          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3335
3336      // Up to 32 threads in warp 0 are active.
3337      llvm::Value *IsActiveThread =
3338          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3339      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3340
3341      CGF.EmitBlock(W0ThenBB);
3342
3343      // SrcMediumPtr = &medium[tid]
3344      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3345          TransferMedium,
3346          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3347      Address SrcMediumPtr(SrcMediumPtrVal, Align);
3348      // SrcMediumVal = *SrcMediumPtr;
3349      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3350
3351      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3352      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3353      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3354          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3355      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3356      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3357      if (NumIters > 1) {
3358        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3359                                TargetElemPtr.getAlignment());
3360      }
3361
3362      // *TargetElemPtr = SrcMediumVal;
3363      llvm::Value *SrcMediumValue =
3364          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3365      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3366                            CType);
3367      Bld.CreateBr(W0MergeBB);
3368
3369      CGF.EmitBlock(W0ElseBB);
3370      Bld.CreateBr(W0MergeBB);
3371
3372      CGF.EmitBlock(W0MergeBB);
3373
3374      if (NumIters > 1) {
3375        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3376        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3377        CGF.EmitBranch(PrecondBB);
3378        (void)ApplyDebugLocation::CreateEmpty(CGF);
3379        CGF.EmitBlock(ExitBB);
3380      }
3381      RealTySize %= TySize;
3382    }
3383    ++Idx;
3384  }
3385
3386  CGF.FinishFunction();
3387  return Fn;
3388}
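// Worked instance of the chunk loop above: a (hypothetical) 7-byte element
// crosses the transfer medium as one 4-byte chunk (7 / 4 == 1), then one
// 2-byte chunk (3 / 2 == 1), then one 1-byte chunk, with the two kmpc
// barriers framing each pass.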
3389
3390/// Emit a helper that reduces data across two OpenMP threads (lanes)
3391/// in the same warp.  It uses shuffle instructions to copy over data from
3392/// a remote lane's stack.  The reduction algorithm performed is specified
3393/// by the fourth parameter.
3394///
3395/// Algorithm Versions.
3396/// Full Warp Reduce (argument value 0):
3397///   This algorithm assumes that all 32 lanes are active and gathers
3398///   data from these 32 lanes, producing a single resultant value.
3399/// Contiguous Partial Warp Reduce (argument value 1):
3400///   This algorithm assumes that only a *contiguous* subset of lanes
3401///   are active.  This happens for the last warp in a parallel region
3402///   when the user specified num_threads is not an integer multiple of
3403///   32.  This contiguous subset always starts with the zeroth lane.
3404/// Partial Warp Reduce (argument value 2):
3405///   This algorithm gathers data from any number of lanes at any position.
3406/// All reduced values are stored in the lowest possible lane.  The set
3407/// of problems every algorithm addresses is a super set of those
3408/// addressable by algorithms with a lower version number.  Overhead
3409/// increases as algorithm version increases.
3410///
3411/// Terminology
3412/// Reduce element:
3413///   Reduce element refers to the individual data field with primitive
3414///   data types to be combined and reduced across threads.
3415/// Reduce list:
3416///   Reduce list refers to a collection of local, thread-private
3417///   reduce elements.
3418/// Remote Reduce list:
3419///   Remote Reduce list refers to a collection of remote (relative to
3420///   the current thread) reduce elements.
3421///
3422/// We distinguish between three states of threads that are important to
3423/// the implementation of this function.
3424/// Alive threads:
3425///   Threads in a warp executing the SIMT instruction, as distinguished from
3426///   threads that are inactive due to divergent control flow.
3427/// Active threads:
3428///   The minimal set of threads that has to be alive upon entry to this
3429///   function.  The computation is correct iff active threads are alive.
3430///   Some threads are alive but they are not active because they do not
3431///   contribute to the computation in any useful manner.  Turning them off
3432///   may introduce control flow overheads without any tangible benefits.
3433/// Effective threads:
3434///   In order to comply with the argument requirements of the shuffle
3435///   function, we must keep all lanes holding data alive.  But at most
3436///   half of them perform value aggregation; we refer to this half of
3437///   threads as effective. The other half is simply handing off their
3438///   data.
3439///
3440/// Procedure
3441/// Value shuffle:
3442///   In this step active threads transfer data from higher lane positions
3443///   in the warp to lower lane positions, creating Remote Reduce list.
3444/// Value aggregation:
3445///   In this step, effective threads combine their thread local Reduce list
3446///   with Remote Reduce list and store the result in the thread local
3447///   Reduce list.
3448/// Value copy:
3449///   In this step, we deal with the assumption made by algorithm 2
3450///   (i.e. contiguity assumption).  When we have an odd number of lanes
3451///   active, say 2k+1, only k threads will be effective and therefore k
3452///   new values will be produced.  However, the Reduce list owned by the
3453///   (2k+1)th thread is ignored in the value aggregation.  Therefore
3454///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3455///   that the contiguity assumption still holds.
3456static llvm::Function *emitShuffleAndReduceFunction(
3457    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3458    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3459  ASTContext &C = CGM.getContext();
3460
3461  // Thread local Reduce list used to host the values of data to be reduced.
3462  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3463                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3464  // Current lane id; could be logical.
3465  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3466                              ImplicitParamDecl::Other);
3467  // Offset of the remote source lane relative to the current lane.
3468  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3469                                        C.ShortTy, ImplicitParamDecl::Other);
3470  // Algorithm version.  This is expected to be known at compile time.
3471  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3472                               C.ShortTy, ImplicitParamDecl::Other);
3473  FunctionArgList Args;
3474  Args.push_back(&ReduceListArg);
3475  Args.push_back(&LaneIDArg);
3476  Args.push_back(&RemoteLaneOffsetArg);
3477  Args.push_back(&AlgoVerArg);
3478
3479  const CGFunctionInfo &CGFI =
3480      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3481  auto *Fn = llvm::Function::Create(
3482      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3483      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3484  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3485  Fn->setDoesNotRecurse();
3486  CodeGenFunction CGF(CGM);
3487  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3488
3489  CGBuilderTy &Bld = CGF.Builder;
3490
3491  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3492  Address LocalReduceList(
3493      Bld.CreatePointerBitCastOrAddrSpaceCast(
3494          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3495                               C.VoidPtrTy, SourceLocation()),
3496          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3497      CGF.getPointerAlign());
3498
3499  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3500  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3501      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3502
3503  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3504  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3505      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3506
3507  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3508  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3509      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3510
3511  // Create a local thread-private variable to host the Reduce list
3512  // from a remote lane.
3513  Address RemoteReduceList =
3514      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3515
3516  // This loop iterates through the list of reduce elements and copies,
3517  // element by element, from a remote lane in the warp to RemoteReduceList,
3518  // hosted on the thread's stack.
3519  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3520                        LocalReduceList, RemoteReduceList,
3521                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3522                         /*ScratchpadIndex=*/nullptr,
3523                         /*ScratchpadWidth=*/nullptr});
3524
3525  // The actions to be performed on the Remote Reduce list depend
3526  // on the algorithm version.
3527  //
3528  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3529  //  LaneId % 2 == 0 && Offset > 0):
3530  //    do the reduction value aggregation
3531  //
3532  //  The thread local variable Reduce list is mutated in place to host the
3533  //  reduced data, which is the aggregated value produced from local and
3534  //  remote lanes.
3535  //
3536  //  Note that AlgoVer is expected to be a constant integer known at compile
3537  //  time.
3538  //  When AlgoVer==0, the first conjunction evaluates to true, making
3539  //    the entire predicate true at compile time.
3540  //  When AlgoVer==1, only the second part of the second conjunction needs
3541  //    to be evaluated at runtime; the other conjunctions evaluate to false
3542  //    at compile time.
3543  //  When AlgoVer==2, only the second part of the third conjunction needs
3544  //    to be evaluated at runtime; the other conjunctions evaluate to false
3545  //    at compile time.
3546  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3547
3548  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3549  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3550      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3551
3552  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3553  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3554      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3555  CondAlgo2 = Bld.CreateAnd(
3556      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3557
3558  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3559  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3560
3561  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3562  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3563  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3564  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3565
3566  CGF.EmitBlock(ThenBB);
3567  // reduce_function(LocalReduceList, RemoteReduceList)
3568  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3569      LocalReduceList.getPointer(), CGF.VoidPtrTy);
3570  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3571      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3572  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3573      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3574  Bld.CreateBr(MergeBB);
3575
3576  CGF.EmitBlock(ElseBB);
3577  Bld.CreateBr(MergeBB);
3578
3579  CGF.EmitBlock(MergeBB);
3580
3581  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3582  // Reduce list.
3583  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3584  llvm::Value *CondCopy = Bld.CreateAnd(
3585      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3586
3587  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3588  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3589  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3590  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3591
3592  CGF.EmitBlock(CpyThenBB);
3593  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3594                        RemoteReduceList, LocalReduceList);
3595  Bld.CreateBr(CpyMergeBB);
3596
3597  CGF.EmitBlock(CpyElseBB);
3598  Bld.CreateBr(CpyMergeBB);
3599
3600  CGF.EmitBlock(CpyMergeBB);
3601
3602  CGF.FinishFunction();
3603  return Fn;
3604}
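
// For reference, the helper emitted above is morally equivalent to the
// following pseudo-C (an illustrative sketch only: the names are invented
// here, and the real copies happen element-by-element via shuffle
// intrinsics):
//
//   void _omp_reduction_shuffle_and_reduce_func(void *reduce_list,
//       int16_t lane_id, int16_t offset, int16_t algo_ver) {
//     ReduceList remote;
//     copy_from_remote_lane(&remote, reduce_list, offset); // via shuffles
//     if (algo_ver == 0 || (algo_ver == 1 && lane_id < offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && offset > 0))
//       reduce_function(reduce_list, &remote); // aggregate in place
//     if (algo_ver == 1 && lane_id >= offset)
//       copy_to_local(reduce_list, &remote);   // keep active lanes contiguous
//   }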
3605
3606/// This function emits a helper that copies all the reduction variables from
3607/// the team into the provided global buffer for the reduction variables.
3608///
3609/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3610///   For all data entries D in reduce_data:
3611///     Copy local D to buffer.D[Idx]
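///
///   For example, with two entries D0 (double) and D1 (float), and a buffer
///   laid out as struct { double D0[NumRecs]; float D1[NumRecs]; }, the
///   emitted helper performs, in effect (a sketch):
///     buffer->D0[Idx] = *reduce_data.D0;
///     buffer->D1[Idx] = *reduce_data.D1;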
3612static llvm::Value *emitListToGlobalCopyFunction(
3613    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3614    QualType ReductionArrayTy, SourceLocation Loc,
3615    const RecordDecl *TeamReductionRec,
3616    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3617        &VarFieldMap) {
3618  ASTContext &C = CGM.getContext();
3619
3620  // Buffer: global reduction buffer.
3621  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3622                              C.VoidPtrTy, ImplicitParamDecl::Other);
3623  // Idx: index of the buffer.
3624  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3625                           ImplicitParamDecl::Other);
3626  // ReduceList: thread local Reduce list.
3627  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3628                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3629  FunctionArgList Args;
3630  Args.push_back(&BufferArg);
3631  Args.push_back(&IdxArg);
3632  Args.push_back(&ReduceListArg);
3633
3634  const CGFunctionInfo &CGFI =
3635      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3636  auto *Fn = llvm::Function::Create(
3637      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3638      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3639  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3640  Fn->setDoesNotRecurse();
3641  CodeGenFunction CGF(CGM);
3642  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3643
3644  CGBuilderTy &Bld = CGF.Builder;
3645
3646  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3647  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3648  Address LocalReduceList(
3649      Bld.CreatePointerBitCastOrAddrSpaceCast(
3650          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3651                               C.VoidPtrTy, Loc),
3652          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3653      CGF.getPointerAlign());
3654  QualType StaticTy = C.getRecordType(TeamReductionRec);
3655  llvm::Type *LLVMReductionsBufferTy =
3656      CGM.getTypes().ConvertTypeForMem(StaticTy);
3657  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3658      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3659      LLVMReductionsBufferTy->getPointerTo());
3660  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3661                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3662                                              /*Volatile=*/false, C.IntTy,
3663                                              Loc)};
3664  unsigned Idx = 0;
3665  for (const Expr *Private : Privates) {
3666    // Reduce element = LocalReduceList[i]
3667    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3668    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3669        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3670    // elemptr = ((CopyType*)(elemptrptr)) + I
3671    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3672        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3673    Address ElemPtr =
3674        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3675    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3676    // Global = Buffer.VD[Idx];
3677    const FieldDecl *FD = VarFieldMap.lookup(VD);
3678    LValue GlobLVal = CGF.EmitLValueForField(
3679        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3680    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3681    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3682    switch (CGF.getEvaluationKind(Private->getType())) {
3683    case TEK_Scalar: {
3684      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3685                                            Private->getType(), Loc);
3686      CGF.EmitStoreOfScalar(V, GlobLVal);
3687      break;
3688    }
3689    case TEK_Complex: {
3690      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3691          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3692      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3693      break;
3694    }
3695    case TEK_Aggregate:
3696      CGF.EmitAggregateCopy(GlobLVal,
3697                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3698                            Private->getType(), AggValueSlot::DoesNotOverlap);
3699      break;
3700    }
3701    ++Idx;
3702  }
3703
3704  CGF.FinishFunction();
3705  return Fn;
3706}
3707
3708/// This function emits a helper that reduces all the reduction variables from
3709/// the team into the provided global buffer for the reduction variables.
3710///
3711/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3712///  void *GlobPtrs[];
3713///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3714///  ...
3715///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3716///  reduce_function(GlobPtrs, reduce_data);
3717static llvm::Value *emitListToGlobalReduceFunction(
3718    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3719    QualType ReductionArrayTy, SourceLocation Loc,
3720    const RecordDecl *TeamReductionRec,
3721    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3722        &VarFieldMap,
3723    llvm::Function *ReduceFn) {
3724  ASTContext &C = CGM.getContext();
3725
3726  // Buffer: global reduction buffer.
3727  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3728                              C.VoidPtrTy, ImplicitParamDecl::Other);
3729  // Idx: index of the buffer.
3730  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3731                           ImplicitParamDecl::Other);
3732  // ReduceList: thread local Reduce list.
3733  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3734                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3735  FunctionArgList Args;
3736  Args.push_back(&BufferArg);
3737  Args.push_back(&IdxArg);
3738  Args.push_back(&ReduceListArg);
3739
3740  const CGFunctionInfo &CGFI =
3741      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3742  auto *Fn = llvm::Function::Create(
3743      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3744      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3745  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3746  Fn->setDoesNotRecurse();
3747  CodeGenFunction CGF(CGM);
3748  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3749
3750  CGBuilderTy &Bld = CGF.Builder;
3751
3752  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3753  QualType StaticTy = C.getRecordType(TeamReductionRec);
3754  llvm::Type *LLVMReductionsBufferTy =
3755      CGM.getTypes().ConvertTypeForMem(StaticTy);
3756  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3757      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3758      LLVMReductionsBufferTy->getPointerTo());
3759
3760  // 1. Build a list of reduction variables.
3761  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3762  Address ReductionList =
3763      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3764  auto IPriv = Privates.begin();
3765  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3766                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3767                                              /*Volatile=*/false, C.IntTy,
3768                                              Loc)};
3769  unsigned Idx = 0;
3770  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3771    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3772    // Global = Buffer.VD[Idx];
3773    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3774    const FieldDecl *FD = VarFieldMap.lookup(VD);
3775    LValue GlobLVal = CGF.EmitLValueForField(
3776        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3777    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3778    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3779    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3780    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3781      // Store array size.
3782      ++Idx;
3783      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3784      llvm::Value *Size = CGF.Builder.CreateIntCast(
3785          CGF.getVLASize(
3786                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3787              .NumElts,
3788          CGF.SizeTy, /*isSigned=*/false);
3789      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3790                              Elem);
3791    }
3792  }
3793
3794  // Call reduce_function(GlobalReduceList, ReduceList)
3795  llvm::Value *GlobalReduceList =
3796      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3797  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3798  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3799      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3800  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3801      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3802  CGF.FinishFunction();
3803  return Fn;
3804}
3805
3806/// This function emits a helper that copies all the reduction variables from
3807/// the provided global buffer back into the team's reduction variables.
3808///
3809/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3810///   For all data entries D in reduce_data:
3811///     Copy buffer.D[Idx] to local D;
3812static llvm::Value *emitGlobalToListCopyFunction(
3813    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3814    QualType ReductionArrayTy, SourceLocation Loc,
3815    const RecordDecl *TeamReductionRec,
3816    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3817        &VarFieldMap) {
3818  ASTContext &C = CGM.getContext();
3819
3820  // Buffer: global reduction buffer.
3821  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3822                              C.VoidPtrTy, ImplicitParamDecl::Other);
3823  // Idx: index of the buffer.
3824  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3825                           ImplicitParamDecl::Other);
3826  // ReduceList: thread local Reduce list.
3827  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3828                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3829  FunctionArgList Args;
3830  Args.push_back(&BufferArg);
3831  Args.push_back(&IdxArg);
3832  Args.push_back(&ReduceListArg);
3833
3834  const CGFunctionInfo &CGFI =
3835      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3836  auto *Fn = llvm::Function::Create(
3837      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3838      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3839  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3840  Fn->setDoesNotRecurse();
3841  CodeGenFunction CGF(CGM);
3842  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3843
3844  CGBuilderTy &Bld = CGF.Builder;
3845
3846  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3847  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3848  Address LocalReduceList(
3849      Bld.CreatePointerBitCastOrAddrSpaceCast(
3850          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3851                               C.VoidPtrTy, Loc),
3852          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3853      CGF.getPointerAlign());
3854  QualType StaticTy = C.getRecordType(TeamReductionRec);
3855  llvm::Type *LLVMReductionsBufferTy =
3856      CGM.getTypes().ConvertTypeForMem(StaticTy);
3857  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3858      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3859      LLVMReductionsBufferTy->getPointerTo());
3860
3861  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3862                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3863                                              /*Volatile=*/false, C.IntTy,
3864                                              Loc)};
3865  unsigned Idx = 0;
3866  for (const Expr *Private : Privates) {
3867    // Reduce element = LocalReduceList[i]
3868    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3869    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3870        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3871    // elemptr = ((CopyType*)(elemptrptr)) + I
3872    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3873        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3874    Address ElemPtr =
3875        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3876    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3877    // Global = Buffer.VD[Idx];
3878    const FieldDecl *FD = VarFieldMap.lookup(VD);
3879    LValue GlobLVal = CGF.EmitLValueForField(
3880        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3881    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3882    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3883    switch (CGF.getEvaluationKind(Private->getType())) {
3884    case TEK_Scalar: {
3885      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3886      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3887      break;
3888    }
3889    case TEK_Complex: {
3890      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3891      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3892                             /*isInit=*/false);
3893      break;
3894    }
3895    case TEK_Aggregate:
3896      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3897                            GlobLVal, Private->getType(),
3898                            AggValueSlot::DoesNotOverlap);
3899      break;
3900    }
3901    ++Idx;
3902  }
3903
3904  CGF.FinishFunction();
3905  return Fn;
3906}
3907
3908/// This function emits a helper that reduces the data in the provided global
3909/// buffer into the team's reduction variables.
3910///
3911/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3912///  void *GlobPtrs[];
3913///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3914///  ...
3915///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3916///  reduce_function(reduce_data, GlobPtrs);
3917static llvm::Value *emitGlobalToListReduceFunction(
3918    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3919    QualType ReductionArrayTy, SourceLocation Loc,
3920    const RecordDecl *TeamReductionRec,
3921    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3922        &VarFieldMap,
3923    llvm::Function *ReduceFn) {
3924  ASTContext &C = CGM.getContext();
3925
3926  // Buffer: global reduction buffer.
3927  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3928                              C.VoidPtrTy, ImplicitParamDecl::Other);
3929  // Idx: index of the buffer.
3930  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3931                           ImplicitParamDecl::Other);
3932  // ReduceList: thread local Reduce list.
3933  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3934                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3935  FunctionArgList Args;
3936  Args.push_back(&BufferArg);
3937  Args.push_back(&IdxArg);
3938  Args.push_back(&ReduceListArg);
3939
3940  const CGFunctionInfo &CGFI =
3941      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3942  auto *Fn = llvm::Function::Create(
3943      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3944      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3945  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3946  Fn->setDoesNotRecurse();
3947  CodeGenFunction CGF(CGM);
3948  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3949
3950  CGBuilderTy &Bld = CGF.Builder;
3951
3952  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3953  QualType StaticTy = C.getRecordType(TeamReductionRec);
3954  llvm::Type *LLVMReductionsBufferTy =
3955      CGM.getTypes().ConvertTypeForMem(StaticTy);
3956  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3957      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3958      LLVMReductionsBufferTy->getPointerTo());
3959
3960  // 1. Build a list of reduction variables.
3961  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3962  Address ReductionList =
3963      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3964  auto IPriv = Privates.begin();
3965  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3966                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3967                                              /*Volatile=*/false, C.IntTy,
3968                                              Loc)};
3969  unsigned Idx = 0;
3970  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3971    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3972    // Global = Buffer.VD[Idx];
3973    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3974    const FieldDecl *FD = VarFieldMap.lookup(VD);
3975    LValue GlobLVal = CGF.EmitLValueForField(
3976        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3977    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3978    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3979    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3980    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3981      // Store array size.
3982      ++Idx;
3983      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3984      llvm::Value *Size = CGF.Builder.CreateIntCast(
3985          CGF.getVLASize(
3986                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3987              .NumElts,
3988          CGF.SizeTy, /*isSigned=*/false);
3989      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3990                              Elem);
3991    }
3992  }
3993
3994  // Call reduce_function(ReduceList, GlobalReduceList)
3995  llvm::Value *GlobalReduceList =
3996      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3997  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3998  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3999      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
4000  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
4001      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
4002  CGF.FinishFunction();
4003  return Fn;
4004}
4005
4006///
4007/// Design of OpenMP reductions on the GPU
4008///
4009/// Consider a typical OpenMP program with one or more reduction
4010/// clauses:
4011///
4012/// float foo;
4013/// double bar;
4014/// #pragma omp target teams distribute parallel for \
4015///             reduction(+:foo) reduction(*:bar)
4016/// for (int i = 0; i < N; i++) {
4017///   foo += A[i]; bar *= B[i];
4018/// }
4019///
4020/// where 'foo' and 'bar' are reduced across all OpenMP threads in
4021/// all teams.  In our OpenMP implementation on the NVPTX device an
4022/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4023/// within a team are mapped to CUDA threads within a threadblock.
4024/// Our goal is to efficiently aggregate values across all OpenMP
4025/// threads such that:
4026///
4027///   - the compiler and runtime are logically concise, and
4028///   - the reduction is performed efficiently in a hierarchical
4029///     manner as follows: within OpenMP threads in the same warp,
4030///     across warps in a threadblock, and finally across teams on
4031///     the NVPTX device.
4032///
4033/// Introduction to Decoupling
4034///
4035/// We would like to decouple the compiler and the runtime so that the
4036/// latter is ignorant of the reduction variables (number, data types)
4037/// and the reduction operators.  This allows a simpler interface
4038/// and implementation while still attaining good performance.
4039///
4040/// Pseudocode for the aforementioned OpenMP program generated by the
4041/// compiler is as follows:
4042///
4043/// 1. Create private copies of reduction variables on each OpenMP
4044///    thread: 'foo_private', 'bar_private'
4045/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4046///    to it and writes the result in 'foo_private' and 'bar_private'
4047///    respectively.
4048/// 3. Call the OpenMP runtime on the GPU to reduce within a team
4049///    and store the result on the team master:
4050///
4051///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4052///        reduceData, shuffleReduceFn, interWarpCpyFn)
4053///
4054///     where:
4055///       struct ReduceData {
4056///         float *foo;
4057///         double *bar;
4058///       } reduceData
4059///       reduceData.foo = &foo_private
4060///       reduceData.bar = &bar_private
4061///
4062///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4063///     auxiliary functions generated by the compiler that operate on
4064///     variables of type 'ReduceData'.  They help the runtime perform
4065///     the algorithmic steps in a data-agnostic manner.
4066///
4067///     'shuffleReduceFn' is a pointer to a function that reduces data
4068///     of type 'ReduceData' across two OpenMP threads (lanes) in the
4069///     same warp.  It takes the following arguments as input:
4070///
4071///     a. variable of type 'ReduceData' on the calling lane,
4072///     b. its lane_id,
4073///     c. an offset relative to the current lane_id to generate a
4074///        remote_lane_id.  The remote lane contains the second
4075///        variable of type 'ReduceData' that is to be reduced.
4076///     d. an algorithm version parameter determining which reduction
4077///        algorithm to use.
4078///
4079///     'shuffleReduceFn' retrieves data from the remote lane using
4080///     efficient GPU shuffle intrinsics and reduces, using the
4081///     algorithm specified by the 4th parameter, the two operands
4082///     element-wise.  The result is written to the first operand.
4083///
4084///     Different reduction algorithms are implemented in different
4085///     runtime functions, all calling 'shuffleReduceFn' to perform
4086///     the essential reduction step.  Therefore, based on the 4th
4087///     parameter, this function behaves slightly differently to
4088///     cooperate with the runtime to ensure correctness under
4089///     different circumstances.
4090///
4091///     'InterWarpCpyFn' is a pointer to a function that transfers
4092///     reduced variables across warps.  It tunnels, through CUDA
4093///     shared memory, the thread-private data of type 'ReduceData'
4094///     from lane 0 of each warp to a lane in the first warp.
4095/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4096///    The last team writes the global reduced value to memory.
4097///
4098///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
4099///             reduceData, shuffleReduceFn, interWarpCpyFn,
4100///             scratchpadCopyFn, loadAndReduceFn)
4101///
4102///     'scratchpadCopyFn' is a helper that stores reduced
4103///     data from the team master to a scratchpad array in
4104///     global memory.
4105///
4106///     'loadAndReduceFn' is a helper that loads data from
4107///     the scratchpad array and reduces it with the input
4108///     operand.
4109///
4110///     These compiler generated functions hide address
4111///     calculation and alignment information from the runtime.
4112/// 5. if ret == 1:
4113///     The team master of the last team stores the reduced
4114///     result to the globals in memory.
4115///     foo += reduceData.foo; bar *= reduceData.bar
4116///
4117///
4118/// Warp Reduction Algorithms
4119///
4120/// On the warp level, we have three algorithms implemented in the
4121/// OpenMP runtime depending on the number of active lanes:
4122///
4123/// Full Warp Reduction
4124///
4125/// The reduce algorithm within a warp where all lanes are active
4126/// is implemented in the runtime as follows:
4127///
4128/// full_warp_reduce(void *reduce_data,
4129///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4130///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4131///     ShuffleReduceFn(reduce_data, 0, offset, 0);
4132/// }
4133///
4134/// The algorithm completes in log(2, WARPSIZE) steps.
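///
/// For example, with WARPSIZE == 32 the loop executes five times, using
/// offsets 16, 8, 4, 2 and 1, leaving the fully reduced value in lane 0.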
4135///
4136/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
4137/// not used by the algorithm, so we save instructions by not retrieving it
4138/// from the corresponding special registers.  The 4th parameter, which
4139/// represents the version of the algorithm being used, is set to 0 to
4140/// signify full warp reduction.
4141///
4142/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4143///
4144/// #reduce_elem refers to an element in the local lane's data structure
4145/// #remote_elem is retrieved from a remote lane
4146/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4147/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
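///
/// Here 'shuffle_down' stands for the warp shuffle-down primitive: on CUDA
/// targets it corresponds to the __shfl_down_sync() intrinsic (wrapped by
/// the __kmpc_shuffle_int32/__kmpc_shuffle_int64 runtime functions), which
/// reads each element from the lane at lane_id + offset.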
4148///
4149/// Contiguous Partial Warp Reduction
4150///
4151/// This reduce algorithm is used within a warp where only the first
4152/// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
4153/// number of OpenMP threads in a parallel region is not a multiple of
4154/// WARPSIZE.  The algorithm is implemented in the runtime as follows:
4155///
4156/// void
4157/// contiguous_partial_reduce(void *reduce_data,
4158///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
4159///                           int size, int lane_id) {
4160///   int curr_size;
4161///   int offset;
4162///   curr_size = size;
4163///   offset = curr_size/2;
4164///   while (offset>0) {
4165///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4166///     curr_size = (curr_size+1)/2;
4167///     offset = curr_size/2;
4168///   }
4169/// }
4170///
4171/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4172///
4173/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4174/// if (lane_id < offset)
4175///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4176/// else
4177///     reduce_elem = remote_elem
4178///
4179/// This algorithm assumes that the data to be reduced are located in a
4180/// contiguous subset of lanes starting from the first.  When there is
4181/// an odd number of active lanes, the data in the last lane is not
4182///   aggregated with any other lane's data but is instead copied over.
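///
/// For example, with size == 7 the loop first runs with offset == 3 (lanes
/// 0-2 reduce with lanes 3-5, and lane 3 takes a copy of lane 6's data,
/// leaving 4 values), then with offset == 2 (leaving 2 values), and finally
/// with offset == 1, after which the result is in lane 0.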
4183///
4184/// Dispersed Partial Warp Reduction
4185///
4186/// This algorithm is used within a warp when any discontiguous subset of
4187/// lanes are active.  It is used to implement the reduction operation
4188/// across lanes in an OpenMP simd region or in a nested parallel region.
4189///
4190/// void
4191/// dispersed_partial_reduce(void *reduce_data,
4192///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4193///   int size, remote_id;
4194///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
4195///   do {
4196///       remote_id = next_active_lane_id_right_after_me();
4197///       # the above function returns 0 if no active lane
4198///       # is present right after the current lane.
4199///       size = number_of_active_lanes_in_this_warp();
4200///       logical_lane_id /= 2;
4201///       ShuffleReduceFn(reduce_data, logical_lane_id,
4202///                       remote_id-1-threadIdx.x, 2);
4203///   } while (logical_lane_id % 2 == 0 && size > 1);
4204/// }
4205///
4206/// There is no assumption made about the initial state of the reduction.
4207/// Any number of lanes (>=1) could be active at any position.  The reduction
4208/// result is returned in the first active lane.
4209///
4210/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4211///
4212/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4213/// if (lane_id % 2 == 0 && offset > 0)
4214///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4215/// else
4216///     reduce_elem = remote_elem
4217///
4218///
4219/// Intra-Team Reduction
4220///
4221/// This function, as implemented in the runtime call
4222/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4223/// threads in a team.  It first reduces within a warp using the
4224/// aforementioned algorithms.  We then proceed to gather all such
4225/// reduced values at the first warp.
4226///
4227/// The runtime makes use of the function 'InterWarpCpyFn', which copies
4228/// data from each of the "warp masters" (zeroth lane of each warp, where
4229/// warp-reduced data is held) to the zeroth warp.  This step reduces (in
4230/// a mathematical sense) the problem of reduction across warp masters in
4231/// a block to the problem of warp reduction.
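///
/// In pseudocode, the intra-team phase is therefore (illustrative sketch):
///
///   warp_reduce(reduce_data);                // one of the 3 warp algorithms
///   InterWarpCpyFn(reduce_data, num_warps);  // warp masters -> first warp
///   if (warp_id == 0)
///     warp_reduce(reduce_data);              // final value on team master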
4232///
4233///
4234/// Inter-Team Reduction
4235///
4236/// Once a team has reduced its data to a single value, it is stored in
4237/// a global scratchpad array.  Since each team has a distinct slot, this
4238/// can be done without locking.
4239///
4240/// The last team to write to the scratchpad array proceeds to reduce the
4241/// scratchpad array.  One or more workers in the last team use the helper
4242/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4243/// the k'th worker reduces every k'th element.
4244///
4245/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4246/// reduce across workers and compute a globally reduced value.
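///
/// In pseudocode (an illustrative sketch; the names are invented here, and
/// the last-team detection is shown as an atomic counter for concreteness):
///
///   scratchpad[team_id] = team_reduced_value;        // distinct slot, no lock
///   if (atomic_inc(&teams_done) == num_teams - 1) {  // last team to finish
///     for (i = worker_id; i < num_teams; i += num_workers)
///       loadAndReduceFn(scratchpad, i, reduce_data); // k'th worker, k'th elem
///     // reduce across the workers of the last team:
///     __kmpc_nvptx_parallel_reduce_nowait_v2(..., reduce_data, ...);
///   }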
4247///
4248void CGOpenMPRuntimeNVPTX::emitReduction(
4249    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4250    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4251    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4252  if (!CGF.HaveInsertPoint())
4253    return;
4254
4255  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4256#ifndef NDEBUG
4257  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4258#endif
4259
4260  if (Options.SimpleReduction) {
4261    assert(!TeamsReduction && !ParallelReduction &&
4262           "Invalid reduction selection in emitReduction.");
4263    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4264                                   ReductionOps, Options);
4265    return;
4266  }
4267
4268  assert((TeamsReduction || ParallelReduction) &&
4269         "Invalid reduction selection in emitReduction.");
4270
4271  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4272  // RedList, shuffle_reduce_func, interwarp_copy_func);
4273  // or
4274  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4275  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4276  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4277
4278  llvm::Value *Res;
4279  ASTContext &C = CGM.getContext();
4280  // 1. Build a list of reduction variables.
4281  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4282  auto Size = RHSExprs.size();
4283  for (const Expr *E : Privates) {
4284    if (E->getType()->isVariablyModifiedType())
4285      // Reserve place for array size.
4286      ++Size;
4287  }
4288  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4289  QualType ReductionArrayTy =
4290      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
4291                             /*IndexTypeQuals=*/0);
4292  Address ReductionList =
4293      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4294  auto IPriv = Privates.begin();
4295  unsigned Idx = 0;
4296  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4297    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4298    CGF.Builder.CreateStore(
4299        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4300            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4301        Elem);
4302    if ((*IPriv)->getType()->isVariablyModifiedType()) {
4303      // Store array size.
4304      ++Idx;
4305      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4306      llvm::Value *Size = CGF.Builder.CreateIntCast(
4307          CGF.getVLASize(
4308                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4309              .NumElts,
4310          CGF.SizeTy, /*isSigned=*/false);
4311      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4312                              Elem);
4313    }
4314  }
4315
4316  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4317      ReductionList.getPointer(), CGF.VoidPtrTy);
4318  llvm::Function *ReductionFn = emitReductionFunction(
4319      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4320      LHSExprs, RHSExprs, ReductionOps);
4321  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4322  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4323      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4324  llvm::Value *InterWarpCopyFn =
4325      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4326
4327  if (ParallelReduction) {
4328    llvm::Value *Args[] = {RTLoc,
4329                           ThreadId,
4330                           CGF.Builder.getInt32(RHSExprs.size()),
4331                           ReductionArrayTySize,
4332                           RL,
4333                           ShuffleAndReduceFn,
4334                           InterWarpCopyFn};
4335
4336    Res = CGF.EmitRuntimeCall(
4337        createNVPTXRuntimeFunction(
4338            OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4339        Args);
4340  } else {
4341    assert(TeamsReduction && "expected teams reduction.");
4342    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4343    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4344    int Cnt = 0;
4345    for (const Expr *DRE : Privates) {
4346      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4347      ++Cnt;
4348    }
4349    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4350        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4351        C.getLangOpts().OpenMPCUDAReductionBufNum);
4352    TeamsReductions.push_back(TeamReductionRec);
4353    if (!KernelTeamsReductionPtr) {
4354      KernelTeamsReductionPtr = new llvm::GlobalVariable(
4355          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4356          llvm::GlobalValue::InternalLinkage, nullptr,
4357          "_openmp_teams_reductions_buffer_$_$ptr");
4358    }
4359    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4360        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4361        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4362    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4363        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4364    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4365        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4366        ReductionFn);
4367    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4368        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4369    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4370        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4371        ReductionFn);
4372
4373    llvm::Value *Args[] = {
4374        RTLoc,
4375        ThreadId,
4376        GlobalBufferPtr,
4377        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4378        RL,
4379        ShuffleAndReduceFn,
4380        InterWarpCopyFn,
4381        GlobalToBufferCpyFn,
4382        GlobalToBufferRedFn,
4383        BufferToGlobalCpyFn,
4384        BufferToGlobalRedFn};
4385
4386    Res = CGF.EmitRuntimeCall(
4387        createNVPTXRuntimeFunction(
4388            OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4389        Args);
4390  }
4391
4392  // 5. Build if (res == 1)
4393  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4394  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4395  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4396      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4397  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4398
4399  // 6. Build then branch: where we have reduced values in the master
4400  //    thread in each team.
4401  //    __kmpc_end_reduce{_nowait}(<gtid>);
4402  //    break;
4403  CGF.EmitBlock(ThenBB);
4404
4405  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4406  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4407                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4408    auto IPriv = Privates.begin();
4409    auto ILHS = LHSExprs.begin();
4410    auto IRHS = RHSExprs.begin();
4411    for (const Expr *E : ReductionOps) {
4412      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4413                                  cast<DeclRefExpr>(*IRHS));
4414      ++IPriv;
4415      ++ILHS;
4416      ++IRHS;
4417    }
4418  };
4419  llvm::Value *EndArgs[] = {ThreadId};
4420  RegionCodeGenTy RCG(CodeGen);
4421  NVPTXActionTy Action(
4422      nullptr, llvm::None,
4423      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4424      EndArgs);
4425  RCG.setAction(Action);
4426  RCG(CGF);
4427  // There is no need to emit line number for unconditional branch.
4428  (void)ApplyDebugLocation::CreateEmpty(CGF);
4429  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4430}
4431
4432const VarDecl *
4433CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4434                                         const VarDecl *NativeParam) const {
4435  if (!NativeParam->getType()->isReferenceType())
4436    return NativeParam;
4437  QualType ArgType = NativeParam->getType();
4438  QualifierCollector QC;
4439  const Type *NonQualTy = QC.strip(ArgType);
4440  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4441  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4442    if (Attr->getCaptureKind() == OMPC_map) {
4443      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4444                                                        LangAS::opencl_global);
4445    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4446               PointeeTy.isConstant(CGM.getContext())) {
4447      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4448                                                        LangAS::opencl_generic);
4449    }
4450  }
4451  ArgType = CGM.getContext().getPointerType(PointeeTy);
4452  QC.addRestrict();
4453  enum { NVPTX_local_addr = 5 };
4454  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4455  ArgType = QC.apply(CGM.getContext(), ArgType);
4456  if (isa<ImplicitParamDecl>(NativeParam))
4457    return ImplicitParamDecl::Create(
4458        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4459        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4460  return ParmVarDecl::Create(
4461      CGM.getContext(),
4462      const_cast<DeclContext *>(NativeParam->getDeclContext()),
4463      NativeParam->getBeginLoc(), NativeParam->getLocation(),
4464      NativeParam->getIdentifier(), ArgType,
4465      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4466}
4467
4468Address
4469CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4470                                          const VarDecl *NativeParam,
4471                                          const VarDecl *TargetParam) const {
4472  assert(NativeParam != TargetParam &&
4473         NativeParam->getType()->isReferenceType() &&
4474         "Native arg must not be the same as target arg.");
4475  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4476  QualType NativeParamType = NativeParam->getType();
4477  QualifierCollector QC;
4478  const Type *NonQualTy = QC.strip(NativeParamType);
4479  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4480  unsigned NativePointeeAddrSpace =
4481      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4482  QualType TargetTy = TargetParam->getType();
4483  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4484      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4485  // First cast to generic.
4486  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4487      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4488                      /*AddrSpace=*/0));
4489  // Cast from generic to native address space.
4490  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4491      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4492                      NativePointeeAddrSpace));
4493  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4494  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4495                        NativeParamType);
4496  return NativeParamAddr;
4497}
4498
4499void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4500    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4501    ArrayRef<llvm::Value *> Args) const {
4502  SmallVector<llvm::Value *, 4> TargetArgs;
4503  TargetArgs.reserve(Args.size());
4504  auto *FnType = OutlinedFn.getFunctionType();
4505  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4506    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4507      TargetArgs.append(std::next(Args.begin(), I), Args.end());
4508      break;
4509    }
4510    llvm::Type *TargetType = FnType->getParamType(I);
4511    llvm::Value *NativeArg = Args[I];
4512    if (!TargetType->isPointerTy()) {
4513      TargetArgs.emplace_back(NativeArg);
4514      continue;
4515    }
4516    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4517        NativeArg,
4518        NativeArg->getType()->getPointerElementType()->getPointerTo());
4519    TargetArgs.emplace_back(
4520        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4521  }
4522  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4523}
4524
4525/// Emit a function that wraps the outlined parallel region
4526/// and controls the arguments that are passed to it.
4527/// The wrapper ensures that the outlined function is called
4528/// with the correct arguments when data is shared.
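///
/// Conceptually, the generated wrapper is (illustrative sketch; the
/// parameter names are invented here):
///
///   void outlined_parallel_fn_wrapper(int16_t parallel_level, int32_t tid) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     // Forward the shared captures to the real outlined function.
///     outlined_parallel_fn(&tid, &zero, shared_args[0], ...,
///                          shared_args[n-1]);
///   }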
4529llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4530    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4531  ASTContext &Ctx = CGM.getContext();
4532  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4533
4534  // Create a function that takes as argument the source thread.
4535  FunctionArgList WrapperArgs;
4536  QualType Int16QTy =
4537      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4538  QualType Int32QTy =
4539      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4540  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4541                                     /*Id=*/nullptr, Int16QTy,
4542                                     ImplicitParamDecl::Other);
4543  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4544                               /*Id=*/nullptr, Int32QTy,
4545                               ImplicitParamDecl::Other);
4546  WrapperArgs.emplace_back(&ParallelLevelArg);
4547  WrapperArgs.emplace_back(&WrapperArg);
4548
4549  const CGFunctionInfo &CGFI =
4550      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4551
4552  auto *Fn = llvm::Function::Create(
4553      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4554      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4555  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4556  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4557  Fn->setDoesNotRecurse();
4558
4559  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4560  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4561                    D.getBeginLoc(), D.getBeginLoc());
4562
4563  const auto *RD = CS.getCapturedRecordDecl();
4564  auto CurField = RD->field_begin();
4565
4566  Address ZeroAddr = CGF.CreateMemTemp(
4567      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4568      /*Name*/ ".zero.addr");
4569  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4570  // Get the array of arguments.
4571  SmallVector<llvm::Value *, 8> Args;
4572
4573  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4574  Args.emplace_back(ZeroAddr.getPointer());
4575
4576  CGBuilderTy &Bld = CGF.Builder;
4577  auto CI = CS.capture_begin();
4578
4579  // Use global memory for data sharing.
4580  // Handle passing of global args to workers.
4581  Address GlobalArgs =
4582      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4583  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4584  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4585  CGF.EmitRuntimeCall(
4586      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4587      DataSharingArgs);
4588
4589  // Retrieve the shared variables from the list of references returned
4590  // by the runtime. Pass the variables to the outlined function.
4591  Address SharedArgListAddress = Address::invalid();
4592  if (CS.capture_size() > 0 ||
4593      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4594    SharedArgListAddress = CGF.EmitLoadOfPointer(
4595        GlobalArgs, CGF.getContext()
4596                        .getPointerType(CGF.getContext().getPointerType(
4597                            CGF.getContext().VoidPtrTy))
4598                        .castAs<PointerType>());
4599  }
4600  unsigned Idx = 0;
4601  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4602    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4603    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4604        Src, CGF.SizeTy->getPointerTo());
4605    llvm::Value *LB = CGF.EmitLoadOfScalar(
4606        TypedAddress,
4607        /*Volatile=*/false,
4608        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4609        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4610    Args.emplace_back(LB);
4611    ++Idx;
4612    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4613    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4614        Src, CGF.SizeTy->getPointerTo());
4615    llvm::Value *UB = CGF.EmitLoadOfScalar(
4616        TypedAddress,
4617        /*Volatile=*/false,
4618        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4619        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4620    Args.emplace_back(UB);
4621    ++Idx;
4622  }
4623  if (CS.capture_size() > 0) {
4624    ASTContext &CGFContext = CGF.getContext();
4625    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4626      QualType ElemTy = CurField->getType();
4627      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4628      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4629          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4630      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4631                                              /*Volatile=*/false,
4632                                              CGFContext.getPointerType(ElemTy),
4633                                              CI->getLocation());
4634      if (CI->capturesVariableByCopy() &&
4635          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4636        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4637                              CI->getLocation());
4638      }
4639      Args.emplace_back(Arg);
4640    }
4641  }
4642
4643  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4644  CGF.FinishFunction();
4645  return Fn;
4646}
4647
4648void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4649                                              const Decl *D) {
4650  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4651    return;
4652
4653  assert(D && "Expected function or captured|block decl.");
4654  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4655         "Function is registered already.");
4656  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4657         "Team is set but not processed.");
4658  const Stmt *Body = nullptr;
4659  bool NeedToDelayGlobalization = false;
4660  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4661    Body = FD->getBody();
4662  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4663    Body = BD->getBody();
4664  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4665    Body = CD->getBody();
4666    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4667    if (NeedToDelayGlobalization &&
4668        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4669      return;
4670  }
4671  if (!Body)
4672    return;
4673  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4674  VarChecker.Visit(Body);
4675  const RecordDecl *GlobalizedVarsRecord =
4676      VarChecker.getGlobalizedRecord(IsInTTDRegion);
4677  TeamAndReductions.first = nullptr;
4678  TeamAndReductions.second.clear();
4679  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4680      VarChecker.getEscapedVariableLengthDecls();
4681  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4682    return;
4683  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4684  I->getSecond().MappedParams =
4685      llvm::make_unique<CodeGenFunction::OMPMapVars>();
4686  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4687  I->getSecond().EscapedParameters.insert(
4688      VarChecker.getEscapedParameters().begin(),
4689      VarChecker.getEscapedParameters().end());
4690  I->getSecond().EscapedVariableLengthDecls.append(
4691      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4692  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4693  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4694     (0) . __assert_fail ("VD->isCanonicalDecl() && \"Expected canonical declaration\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4694, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4695    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4696    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4697  }
4698  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4699    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4700    VarChecker.Visit(Body);
4701    I->getSecond().SecondaryGlobalRecord =
4702        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4703    I->getSecond().SecondaryLocalVarData.emplace();
4704    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4705    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4706       (0) . __assert_fail ("VD->isCanonicalDecl() && \"Expected canonical declaration\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4706, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4707      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4708      Data.insert(
4709          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4710    }
4711  }
4712  if (!NeedToDelayGlobalization) {
4713    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4714    struct GlobalizationScope final : EHScopeStack::Cleanup {
4715      GlobalizationScope() = default;
4716
4717      void Emit(CodeGenFunction &CGF, Flags flags) override {
4718        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4719            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4720      }
4721    };
4722    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4723  }
4724}
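// As an illustration of when this prolog globalizes anything: in generic
// data-sharing mode, a local variable of the team master that is referenced
// from a later parallel region escapes its stack frame and is moved into the
// globalized record registered above, e.g.:
// \code
//   #pragma omp target
//   {
//     int Escaped = 0;   // globalized: workers read/write the master's copy
//     #pragma omp parallel
//     Escaped += 1;
//     int Priv = 0;      // does not escape: stays a plain stack local
//   }
// \endcode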
4725
4726Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4727                                                        const VarDecl *VD) {
4728  bool UseDefaultAllocator = true;
4729  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4730    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4731    switch (A->getAllocatorType()) {
4732      // Use the default allocator here as by default local vars are
4733      // threadlocal.
4734    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4735    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4736      // Just pass-through to check if the globalization is required.
4737      break;
4738    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4739    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4740    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4741    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4742    case OMPAllocateDeclAttr::OMPConstMemAlloc:
4743    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4744    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4745      UseDefaultAllocator = false;
4746      break;
4747    }
4748  }
4749
4750  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4751    return Address::invalid();
4752
4753  VD = VD->getCanonicalDecl();
4754  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4755  if (I == FunctionGlobalizedDecls.end())
4756    return Address::invalid();
4757  auto VDI = I->getSecond().LocalVarData.find(VD);
4758  if (VDI != I->getSecond().LocalVarData.end())
4759    return VDI->second.PrivateAddr;
4760  if (VD->hasAttrs()) {
4761    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4762         E(VD->attr_end());
4763         IT != E; ++IT) {
4764      auto VDI = I->getSecond().LocalVarData.find(
4765          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4766              ->getCanonicalDecl());
4767      if (VDI != I->getSecond().LocalVarData.end())
4768        return VDI->second.PrivateAddr;
4769    }
4770  }
4771
4772  // TODO: replace this with return
4773  // UseDefaultAllocator ? Address::invalid() :
4774  // CGOpenMPRuntime::getAddressOfLocalVariable(CGF, VD); once the NVPTX
4775  // libomp supports __kmpc_alloc|__kmpc_free.
4776  (void)UseDefaultAllocator; // Prevent an unused-variable warning.
4777  return Address::invalid();
4778}
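// A sketch of the allocator handling above (assuming OpenMP 5.0 allocate
// syntax): only the default and thread allocators fall through to the
// globalization check; all other predefined allocators suppress the default
// allocator path.
// \code
//   int X;
//   #pragma omp allocate(X) allocator(omp_thread_mem_alloc)  // default path
//   int Y;
//   #pragma omp allocate(Y) allocator(omp_cgroup_mem_alloc)  // non-default
// \endcode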
4779
4780void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4781  FunctionGlobalizedDecls.erase(CGF.CurFn);
4782  CGOpenMPRuntime::functionFinished(CGF);
4783}
4784
4785void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4786    CodeGenFunction &CGF, const OMPLoopDirective &S,
4787    OpenMPDistScheduleClauseKind &ScheduleKind,
4788    llvm::Value *&Chunk) const {
4789  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4790    ScheduleKind = OMPC_DIST_SCHEDULE_static;
4791    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4792        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4793        S.getIterationVariable()->getType(), S.getBeginLoc());
4794    return;
4795  }
4796  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4797      CGF, S, ScheduleKind, Chunk);
4798}
4799
4800void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4801    CodeGenFunction &CGF, const OMPLoopDirective &S,
4802    OpenMPScheduleClauseKind &ScheduleKind,
4803    const Expr *&ChunkExpr) const {
4804  ScheduleKind = OMPC_SCHEDULE_static;
4805  // Chunk size is 1 in this case.
4806  llvm::APInt ChunkSize(32, 1);
4807  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4808      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4809      SourceLocation());
4810}
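// Taken together, the two overrides above make an SPMD construct such as
// \code
//   #pragma omp target teams distribute parallel for
//   for (int I = 0; I < N; ++I)
//     A[I] += 1;
// \endcode
// behave as if dist_schedule(static, <number-of-threads>) and
// schedule(static, 1) had been written explicitly, so consecutive threads
// touch consecutive elements and global memory accesses coalesce.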
4811
4812void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4813    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4814  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4815         "Expected target-based directive.");
4816  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4817  for (const CapturedStmt::Capture &C : CS->captures()) {
4818    // Capture variables captured by reference in lambdas for target-based
4819    // directives.
4820    if (!C.capturesVariable())
4821      continue;
4822    const VarDecl *VD = C.getCapturedVar();
4823    const auto *RD = VD->getType()
4824                         .getCanonicalType()
4825                         .getNonReferenceType()
4826                         ->getAsCXXRecordDecl();
4827    if (!RD || !RD->isLambda())
4828      continue;
4829    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4830    LValue VDLVal;
4831    if (VD->getType().getCanonicalType()->isReferenceType())
4832      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4833    else
4834      VDLVal = CGF.MakeAddrLValue(
4835          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4836    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4837    FieldDecl *ThisCapture = nullptr;
4838    RD->getCaptureFields(Captures, ThisCapture);
4839    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4840      LValue ThisLVal =
4841          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4842      llvm::Value *CXXThis = CGF.LoadCXXThis();
4843      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4844    }
4845    for (const LambdaCapture &LC : RD->captures()) {
4846      if (LC.getCaptureKind() != LCK_ByRef)
4847        continue;
4848      const VarDecl *VD = LC.getCapturedVar();
4849      if (!CS->capturesVariable(VD))
4850        continue;
4851      auto It = Captures.find(VD);
4852       (0) . __assert_fail ("It != Captures.end() && \"Found lambda capture without field.\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4852, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(It != Captures.end() && "Found lambda capture without field.");
4853      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4854      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4855      if (VD->getType().getCanonicalType()->isReferenceType())
4856        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4857                                               VD->getType().getCanonicalType())
4858                     .getAddress();
4859      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4860    }
4861  }
4862}
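// Example of the capture rewiring performed above (sketch): for a lambda that
// captures by reference and is used inside a target region,
// \code
//   int A = 0;
//   auto L = [&A]() { A += 1; };
//   #pragma omp target map(tofrom : A)
//   L();
// \endcode
// the field for 'A' inside the device copy of 'L' is re-pointed at the
// device-side instance of 'A' before the body runs.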
4863
4864unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
4865  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4866}
4867
4868bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4869                                                            LangAS &AS) {
4870  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4871    return false;
4872  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4873  switch (A->getAllocatorType()) {
4874  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4875  // Not supported, fall back to the default memory space.
4876  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4877  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4878  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4879  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4880  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4881    AS = LangAS::Default;
4882    return true;
4883  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4884    AS = LangAS::cuda_constant;
4885    return true;
4886  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4887    AS = LangAS::cuda_shared;
4888    return true;
4889  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4890    llvm_unreachable("Expected predefined allocator for the variables with the "
4891                     "static storage.");
4892  }
4893  return false;
4894}
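// In effect (a hedged sketch of the mapping above): omp_const_mem_alloc
// places the global in CUDA constant memory and omp_pteam_mem_alloc in CUDA
// shared memory, while the remaining predefined allocators use the default
// address space.
// \code
//   int G;
//   #pragma omp allocate(G) allocator(omp_const_mem_alloc)  // -> __constant__
//   int S;
//   #pragma omp allocate(S) allocator(omp_pteam_mem_alloc)  // -> __shared__
// \endcode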
4895
4896// Get the current CudaArch, ignoring any unknown values.
4897static CudaArch getCudaArch(CodeGenModule &CGM) {
4898  if (!CGM.getTarget().hasFeature("ptx"))
4899    return CudaArch::UNKNOWN;
4900  llvm::StringMap<bool> Features;
4901  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4902                                 CGM.getTarget().getTargetOpts().CPU,
4903                                 CGM.getTarget().getTargetOpts().Features);
4904  for (const auto &Feature : Features) {
4905    if (Feature.getValue()) {
4906      CudaArch Arch = StringToCudaArch(Feature.getKey());
4907      if (Arch != CudaArch::UNKNOWN)
4908        return Arch;
4909    }
4910  }
4911  return CudaArch::UNKNOWN;
4912}
4913
4914/// Check whether the target architecture supports unified addressing, which
4915/// is required by the OpenMP "requires" clause unified_shared_memory.
4916void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4917    const OMPRequiresDecl *D) const {
4918  for (const OMPClause *Clause : D->clauselists()) {
4919    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4920      switch (getCudaArch(CGM)) {
4921      case CudaArch::SM_20:
4922      case CudaArch::SM_21:
4923      case CudaArch::SM_30:
4924      case CudaArch::SM_32:
4925      case CudaArch::SM_35:
4926      case CudaArch::SM_37:
4927      case CudaArch::SM_50:
4928      case CudaArch::SM_52:
4929      case CudaArch::SM_53:
4930      case CudaArch::SM_60:
4931      case CudaArch::SM_61:
4932      case CudaArch::SM_62:
4933        CGM.Error(Clause->getBeginLoc(),
4934                  "Target architecture does not support unified addressing");
4935        return;
4936      case CudaArch::SM_70:
4937      case CudaArch::SM_72:
4938      case CudaArch::SM_75:
4939      case CudaArch::GFX600:
4940      case CudaArch::GFX601:
4941      case CudaArch::GFX700:
4942      case CudaArch::GFX701:
4943      case CudaArch::GFX702:
4944      case CudaArch::GFX703:
4945      case CudaArch::GFX704:
4946      case CudaArch::GFX801:
4947      case CudaArch::GFX802:
4948      case CudaArch::GFX803:
4949      case CudaArch::GFX810:
4950      case CudaArch::GFX900:
4951      case CudaArch::GFX902:
4952      case CudaArch::GFX904:
4953      case CudaArch::GFX906:
4954      case CudaArch::GFX909:
4955      case CudaArch::UNKNOWN:
4956        break;
4957      case CudaArch::LAST:
4958        llvm_unreachable("Unexpected Cuda arch.");
4959      }
4960    }
4961  }
4962}
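// For instance, a translation unit containing
// \code
//   #pragma omp requires unified_shared_memory
// \endcode
// compiled for sm_62 or older is rejected with the diagnostic above, while
// sm_70 and newer (and the listed GFX targets) pass the check.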
4963
4964/// Get number of SMs and number of blocks per SM.
4965static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4966  std::pair<unsigned, unsigned> Data;
4967  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4968    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4969  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4970    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4971  if (Data.first && Data.second)
4972    return Data;
4973  switch (getCudaArch(CGM)) {
4974  case CudaArch::SM_20:
4975  case CudaArch::SM_21:
4976  case CudaArch::SM_30:
4977  case CudaArch::SM_32:
4978  case CudaArch::SM_35:
4979  case CudaArch::SM_37:
4980  case CudaArch::SM_50:
4981  case CudaArch::SM_52:
4982  case CudaArch::SM_53:
4983    return {16, 16};
4984  case CudaArch::SM_60:
4985  case CudaArch::SM_61:
4986  case CudaArch::SM_62:
4987    return {56, 32};
4988  case CudaArch::SM_70:
4989  case CudaArch::SM_72:
4990  case CudaArch::SM_75:
4991    return {84, 32};
4992  case CudaArch::GFX600:
4993  case CudaArch::GFX601:
4994  case CudaArch::GFX700:
4995  case CudaArch::GFX701:
4996  case CudaArch::GFX702:
4997  case CudaArch::GFX703:
4998  case CudaArch::GFX704:
4999  case CudaArch::GFX801:
5000  case CudaArch::GFX802:
5001  case CudaArch::GFX803:
5002  case CudaArch::GFX810:
5003  case CudaArch::GFX900:
5004  case CudaArch::GFX902:
5005  case CudaArch::GFX904:
5006  case CudaArch::GFX906:
5007  case CudaArch::GFX909:
5008  case CudaArch::UNKNOWN:
5009    break;
5010  case CudaArch::LAST:
5011    llvm_unreachable("Unexpected Cuda arch.");
5012  }
5013  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5014}
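// These defaults size the per-device globalization buffer: the number of
// slots is NumSMs * BlocksPerSM, e.g. 84 * 32 = 2688 records for sm_70.
// Both values can be overridden through the OpenMPCUDANumSMs and
// OpenMPCUDABlocksPerSM language options checked at the top of the function.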
5015
5016void CGOpenMPRuntimeNVPTX::clear() {
5017  if (!GlobalizedRecords.empty()) {
5018    ASTContext &C = CGM.getContext();
5019    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5020    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5021    RecordDecl *StaticRD = C.buildImplicitRecord(
5022        "_openmp_static_memory_type_$_"RecordDecl::TagKind::TTK_Union);
5023    StaticRD->startDefinition();
5024    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5025        "_shared_openmp_static_memory_type_$_"RecordDecl::TagKind::TTK_Union);
5026    SharedStaticRD->startDefinition();
5027    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5028      if (Records.Records.empty())
5029        continue;
5030      unsigned Size = 0;
5031      unsigned RecAlignment = 0;
5032      for (const RecordDecl *RD : Records.Records) {
5033        QualType RDTy = C.getRecordType(RD);
5034        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5035        RecAlignment = std::max(RecAlignment, Alignment);
5036        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5037        Size =
5038            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5039      }
5040      Size = llvm::alignTo(Size, RecAlignment);
5041      llvm::APInt ArySize(/*numBits=*/64, Size);
5042      QualType SubTy = C.getConstantArrayType(
5043          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5044      const bool UseSharedMemory = Size <= SharedMemorySize;
5045      auto *Field =
5046          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5047                            SourceLocation(), SourceLocation(), nullptr, SubTy,
5048                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5049                            /*BW=*/nullptr, /*Mutable=*/false,
5050                            /*InitStyle=*/ICIS_NoInit);
5051      Field->setAccess(AS_public);
5052      if (UseSharedMemory) {
5053        SharedStaticRD->addDecl(Field);
5054        SharedRecs.push_back(&Records);
5055      } else {
5056        StaticRD->addDecl(Field);
5057        GlobalRecs.push_back(&Records);
5058      }
5059      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5060      Records.UseSharedMemory->setInitializer(
5061          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5062    }
5063    // Allocate a buffer of SharedMemorySize bytes in shared memory.
5064    // FIXME: nvlink does not handle weak linkage correctly (objects with
5065    // different sizes are reported as erroneous).
5066    // Restore this code as soon as nvlink is fixed.
5067    if (!SharedStaticRD->field_empty()) {
5068      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5069      QualType SubTy = C.getConstantArrayType(
5070          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5071      auto *Field = FieldDecl::Create(
5072          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5073          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5074          /*BW=*/nullptr, /*Mutable=*/false,
5075          /*InitStyle=*/ICIS_NoInit);
5076      Field->setAccess(AS_public);
5077      SharedStaticRD->addDecl(Field);
5078    }
5079    SharedStaticRD->completeDefinition();
5080    if (!SharedStaticRD->field_empty()) {
5081      QualType StaticTy = C.getRecordType(SharedStaticRD);
5082      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5083      auto *GV = new llvm::GlobalVariable(
5084          CGM.getModule(), LLVMStaticTy,
5085          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5086          llvm::Constant::getNullValue(LLVMStaticTy),
5087          "_openmp_shared_static_glob_rd_$_"/*InsertBefore=*/nullptr,
5088          llvm::GlobalValue::NotThreadLocal,
5089          C.getTargetAddressSpace(LangAS::cuda_shared));
5090      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5091          GV, CGM.VoidPtrTy);
5092      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5093        Rec->Buffer->replaceAllUsesWith(Replacement);
5094        Rec->Buffer->eraseFromParent();
5095      }
5096    }
5097    StaticRD->completeDefinition();
5098    if (!StaticRD->field_empty()) {
5099      QualType StaticTy = C.getRecordType(StaticRD);
5100      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5101      llvm::APInt Size1(32, SMsBlockPerSM.second);
5102      QualType Arr1Ty =
5103          C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5104                                 /*IndexTypeQuals=*/0);
5105      llvm::APInt Size2(32, SMsBlockPerSM.first);
5106      QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5107                                               /*IndexTypeQuals=*/0);
5108      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5109      // FIXME: nvlink does not handle weak linkage correctly (objects with
5110      // different sizes are reported as erroneous).
5111      // Restore CommonLinkage as soon as nvlink is fixed.
5112      auto *GV = new llvm::GlobalVariable(
5113          CGM.getModule(), LLVMArr2Ty,
5114          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5115          llvm::Constant::getNullValue(LLVMArr2Ty),
5116          "_openmp_static_glob_rd_$_");
5117      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5118          GV, CGM.VoidPtrTy);
5119      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5120        Rec->Buffer->replaceAllUsesWith(Replacement);
5121        Rec->Buffer->eraseFromParent();
5122      }
5123    }
5124  }
5125  if (!TeamsReductions.empty()) {
5126    ASTContext &C = CGM.getContext();
5127    RecordDecl *StaticRD = C.buildImplicitRecord(
5128        "_openmp_teams_reduction_type_$_"RecordDecl::TagKind::TTK_Union);
5129    StaticRD->startDefinition();
5130    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5131      QualType RecTy = C.getRecordType(TeamReductionRec);
5132      auto *Field = FieldDecl::Create(
5133          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5134          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5135          /*BW=*/nullptr, /*Mutable=*/false,
5136          /*InitStyle=*/ICIS_NoInit);
5137      Field->setAccess(AS_public);
5138      StaticRD->addDecl(Field);
5139    }
5140    StaticRD->completeDefinition();
5141    QualType StaticTy = C.getRecordType(StaticRD);
5142    llvm::Type *LLVMReductionsBufferTy =
5143        CGM.getTypes().ConvertTypeForMem(StaticTy);
5144    // FIXME: nvlink does not handle weak linkage correctly (objects with
5145    // different sizes are reported as erroneous).
5146    // Restore CommonLinkage as soon as nvlink is fixed.
5147    auto *GV = new llvm::GlobalVariable(
5148        CGM.getModule(), LLVMReductionsBufferTy,
5149        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5150        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5151        "_openmp_teams_reductions_buffer_$_");
5152    KernelTeamsReductionPtr->setInitializer(
5153        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5154                                                             CGM.VoidPtrTy));
5155  }
5156  CGOpenMPRuntime::clear();
5157}
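// After this finalization the module contains, roughly, globals of the
// following shape (illustrative IR; the union types depend on the records
// collected during codegen):
// \code
//   @"_openmp_static_glob_rd_$_" = internal global
//       [<NumSMs> x [<BlocksPerSM> x %"union._openmp_static_memory_type_$_"]]
//       zeroinitializer
//   @"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global
//       %"union._shared_openmp_static_memory_type_$_" zeroinitializer
// \endcode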
5158