CGCUDANV.cpp source code [clang_source_code/lib/CodeGen/CGCUDANV.cpp]

1	//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10	// runtime library.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "CGCUDARuntime.h"
15	#include "CodeGenFunction.h"
16	#include "CodeGenModule.h"
17	#include "clang/AST/Decl.h"
18	#include "clang/Basic/Cuda.h"
19	#include "clang/CodeGen/CodeGenABITypes.h"
20	#include "clang/CodeGen/ConstantInitBuilder.h"
21	#include "llvm/IR/BasicBlock.h"
22	#include "llvm/IR/Constants.h"
23	#include "llvm/IR/DerivedTypes.h"
24	#include "llvm/Support/Format.h"
25
26	using namespace clang;
27	using namespace CodeGen;
28
29	namespace {
30	constexpr unsigned CudaFatMagic = 0x466243b1;
31	constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
32
33	class CGNVCUDARuntime : public CGCUDARuntime {
34
35	private:
36	llvm::IntegerType IntTy, SizeTy;
37	llvm::Type *VoidTy;
38	llvm::PointerType CharPtrTy, VoidPtrTy, *VoidPtrPtrTy;
39
40	/// Convenience reference to LLVM Context
41	llvm::LLVMContext &Context;
42	/// Convenience reference to the current module
43	llvm::Module &TheModule;
44	/// Keeps track of kernel launch stubs emitted in this module
45	struct KernelInfo {
46	llvm::Function *Kernel;
47	const Decl *D;
48	};
49	llvm::SmallVector<KernelInfo, 16> EmittedKernels;
50	struct VarInfo {
51	llvm::GlobalVariable *Var;
52	const VarDecl *D;
53	unsigned Flag;
54	};
55	llvm::SmallVector<VarInfo, 16> DeviceVars;
56	/// Keeps track of variable containing handle of GPU binary. Populated by
57	/// ModuleCtorFunction() and used to create corresponding cleanup calls in
58	/// ModuleDtorFunction()
59	llvm::GlobalVariable *GpuBinaryHandle = nullptr;
60	/// Whether we generate relocatable device code.
61	bool RelocatableDeviceCode;
62	/// Mangle context for device.
63	std::unique_ptr<MangleContext> DeviceMC;
64
65	llvm::FunctionCallee getSetupArgumentFn() const;
66	llvm::FunctionCallee getLaunchFn() const;
67
68	llvm::FunctionType *getRegisterGlobalsFnTy() const;
69	llvm::FunctionType *getCallbackFnTy() const;
70	llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
71	std::string addPrefixToName(StringRef FuncName) const;
72	std::string addUnderscoredPrefixToName(StringRef FuncName) const;
73
74	/// Creates a function to register all kernel stubs generated in this module.
75	llvm::Function *makeRegisterGlobalsFn();
76
77	/// Helper function that generates a constant string and returns a pointer to
78	/// the start of the string. The result of this function can be used anywhere
79	/// where the C code specifies const char*.
80	llvm::Constant *makeConstantString(const std::string &Str,
81	const std::string &Name = "",
82	const std::string &SectionName = "",
83	unsigned Alignment = 0) {
84	llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
85	llvm::ConstantInt::get(SizeTy, 0)};
86	auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
87	llvm::GlobalVariable *GV =
88	cast<llvm::GlobalVariable>(ConstStr.getPointer());
89	if (!SectionName.empty()) {
90	GV->setSection(SectionName);
91	// Mark the address as used which make sure that this section isn't
92	// merged and we will really have it in the object file.
93	GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
94	}
95	if (Alignment)
96	GV->setAlignment(Alignment);
97
98	return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
99	ConstStr.getPointer(), Zeros);
100	}
101
102	/// Helper function that generates an empty dummy function returning void.
103	llvm::Function makeDummyFunction(llvm::FunctionType FnTy) {
104	(0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(FnTy->getReturnType()->isVoidTy() &&
105	(0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> "Can only generate dummy functions returning void!");
106	llvm::Function *DummyFunc = llvm::Function::Create(
107	FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
108
109	llvm::BasicBlock *DummyBlock =
110	llvm::BasicBlock::Create(Context, "", DummyFunc);
111	CGBuilderTy FuncBuilder(CGM, Context);
112	FuncBuilder.SetInsertPoint(DummyBlock);
113	FuncBuilder.CreateRetVoid();
114
115	return DummyFunc;
116	}
117
118	void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
119	void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
120	std::string getDeviceSideName(const Decl *ND);
121
122	public:
123	CGNVCUDARuntime(CodeGenModule &CGM);
124
125	void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
126	void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
127	unsigned Flags) override {
128	DeviceVars.push_back({&Var, VD, Flags});
129	}
130
131	/// Creates module constructor function
132	llvm::Function *makeModuleCtorFunction() override;
133	/// Creates module destructor function
134	llvm::Function *makeModuleDtorFunction() override;
135	};
136
137	}
138
139	std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
140	if (CGM.getLangOpts().HIP)
141	return ((Twine("hip") + Twine(FuncName)).str());
142	return ((Twine("cuda") + Twine(FuncName)).str());
143	}
144	std::string
145	CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
146	if (CGM.getLangOpts().HIP)
147	return ((Twine("__hip") + Twine(FuncName)).str());
148	return ((Twine("__cuda") + Twine(FuncName)).str());
149	}
150
151	CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
152	: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
153	TheModule(CGM.getModule()),
154	RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
155	DeviceMC(CGM.getContext().createMangleContext(
156	CGM.getContext().getAuxTargetInfo())) {
157	CodeGen::CodeGenTypes &Types = CGM.getTypes();
158	ASTContext &Ctx = CGM.getContext();
159
160	IntTy = CGM.IntTy;
161	SizeTy = CGM.SizeTy;
162	VoidTy = CGM.VoidTy;
163
164	CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
165	VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
166	VoidPtrPtrTy = VoidPtrTy->getPointerTo();
167	}
168
169	llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
170	// cudaError_t cudaSetupArgument(void *, size_t, size_t)
171	llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
172	return CGM.CreateRuntimeFunction(
173	llvm::FunctionType::get(IntTy, Params, false),
174	addPrefixToName("SetupArgument"));
175	}
176
177	llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
178	if (CGM.getLangOpts().HIP) {
179	// hipError_t hipLaunchByPtr(char *);
180	return CGM.CreateRuntimeFunction(
181	llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
182	} else {
183	// cudaError_t cudaLaunch(char *);
184	return CGM.CreateRuntimeFunction(
185	llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
186	}
187	}
188
189	llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
190	return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
191	}
192
193	llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
194	return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
195	}
196
197	llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
198	auto CallbackFnTy = getCallbackFnTy();
199	auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
200	llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
201	VoidPtrTy, CallbackFnTy->getPointerTo()};
202	return llvm::FunctionType::get(VoidTy, Params, false);
203	}
204
205	std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
206	auto *ND = cast<const NamedDecl>(D);
207	std::string DeviceSideName;
208	if (DeviceMC->shouldMangleDeclName(ND)) {
209	SmallString<256> Buffer;
210	llvm::raw_svector_ostream Out(Buffer);
211	DeviceMC->mangleName(ND, Out);
212	DeviceSideName = Out.str();
213	} else
214	DeviceSideName = ND->getIdentifier()->getName();
215	return DeviceSideName;
216	}
217
218	void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
219	FunctionArgList &Args) {
220	getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() \|\| getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() \|\|
221	getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() \|\| getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() \|\|
222	getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() \|\| getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> CGF.CGM.getContext().getTargetInfo().getCXXABI() !=
223	getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() \|\| getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() \|\| CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI());
224
225	EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
226	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
227	CudaFeature::CUDA_USES_NEW_LAUNCH))
228	emitDeviceStubBodyNew(CGF, Args);
229	else
230	emitDeviceStubBodyLegacy(CGF, Args);
231	}
232
233	// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
234	// array and kernels are launched using cudaLaunchKernel().
235	void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
236	FunctionArgList &Args) {
237	// Build the shadow stack entry at the very start of the function.
238
239	// Calculate amount of space we will need for all arguments. If we have no
240	// args, allocate a single pointer so we still have a valid pointer to the
241	// argument array that we can pass to runtime, even if it will be unused.
242	Address KernelArgs = CGF.CreateTempAlloca(
243	VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
244	llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
245	// Store pointers to the arguments in a locally allocated launch_args.
246	for (unsigned i = 0; i < Args.size(); ++i) {
247	llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
248	llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
249	CGF.Builder.CreateDefaultAlignedStore(
250	VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
251	}
252
253	llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
254
255	// Lookup cudaLaunchKernel function.
256	// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
257	// void **args, size_t sharedMem,
258	// cudaStream_t stream);
259	TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
260	DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
261	IdentifierInfo &cudaLaunchKernelII =
262	CGM.getContext().Idents.get("cudaLaunchKernel");
263	FunctionDecl *cudaLaunchKernelFD = nullptr;
264	for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
265	if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
266	cudaLaunchKernelFD = FD;
267	}
268
269	if (cudaLaunchKernelFD == nullptr) {
270	CGM.Error(CGF.CurFuncDecl->getLocation(),
271	"Can't find declaration for cudaLaunchKernel()");
272	return;
273	}
274	// Create temporary dim3 grid_dim, block_dim.
275	ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
276	QualType Dim3Ty = GridDimParam->getType();
277	Address GridDim =
278	CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
279	Address BlockDim =
280	CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
281	Address ShmemSize =
282	CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
283	Address Stream =
284	CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
285	llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
286	llvm::FunctionType::get(IntTy,
287	{/gridDim=/GridDim.getType(),
288	/blockDim=/BlockDim.getType(),
289	/ShmemSize=/ShmemSize.getType(),
290	/Stream=/Stream.getType()},
291	/isVarArg=/false),
292	"__cudaPopCallConfiguration");
293
294	CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
295	{GridDim.getPointer(), BlockDim.getPointer(),
296	ShmemSize.getPointer(), Stream.getPointer()});
297
298	// Emit the call to cudaLaunch
299	llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
300	CallArgList LaunchKernelArgs;
301	LaunchKernelArgs.add(RValue::get(Kernel),
302	cudaLaunchKernelFD->getParamDecl(0)->getType());
303	LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
304	LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
305	LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
306	cudaLaunchKernelFD->getParamDecl(3)->getType());
307	LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
308	cudaLaunchKernelFD->getParamDecl(4)->getType());
309	LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
310	cudaLaunchKernelFD->getParamDecl(5)->getType());
311
312	QualType QT = cudaLaunchKernelFD->getType();
313	QualType CQT = QT.getCanonicalType();
314	llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
315	llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
316
317	const CGFunctionInfo &FI =
318	CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
319	llvm::FunctionCallee cudaLaunchKernelFn =
320	CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
321	CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
322	LaunchKernelArgs);
323	CGF.EmitBranch(EndBlock);
324
325	CGF.EmitBlock(EndBlock);
326	}
327
328	void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
329	FunctionArgList &Args) {
330	// Emit a call to cudaSetupArgument for each arg in Args.
331	llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
332	llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
333	CharUnits Offset = CharUnits::Zero();
334	for (const VarDecl *A : Args) {
335	CharUnits TyWidth, TyAlign;
336	std::tie(TyWidth, TyAlign) =
337	CGM.getContext().getTypeInfoInChars(A->getType());
338	Offset = Offset.alignTo(TyAlign);
339	llvm::Value *Args[] = {
340	CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
341	VoidPtrTy),
342	llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
343	llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
344	};
345	llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
346	llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
347	llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
348	llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
349	CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
350	CGF.EmitBlock(NextBlock);
351	Offset += TyWidth;
352	}
353
354	// Emit the call to cudaLaunch
355	llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
356	llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
357	CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
358	CGF.EmitBranch(EndBlock);
359
360	CGF.EmitBlock(EndBlock);
361	}
362
363	/// Creates a function that sets up state on the host side for CUDA objects that
364	/// have a presence on both the host and device sides. Specifically, registers
365	/// the host side of kernel functions and device global variables with the CUDA
366	/// runtime.
367	/// \code
368	/// void __cuda_register_globals(void** GpuBinaryHandle) {
369	/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
370	/// ...
371	/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
372	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
373	/// ...
374	/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
375	/// }
376	/// \endcode
377	llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
378	// No need to register anything
379	if (EmittedKernels.empty() && DeviceVars.empty())
380	return nullptr;
381
382	llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
383	getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
384	addUnderscoredPrefixToName("_register_globals"), &TheModule);
385	llvm::BasicBlock *EntryBB =
386	llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
387	CGBuilderTy Builder(CGM, Context);
388	Builder.SetInsertPoint(EntryBB);
389
390	// void __cudaRegisterFunction(void *, const char , char , const char ,
391	// int, uint3, uint3, dim3, dim3, int*)
392	llvm::Type *RegisterFuncParams[] = {
393	VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
394	VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
395	llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
396	llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
397	addUnderscoredPrefixToName("RegisterFunction"));
398
399	// Extract GpuBinaryHandle passed as the first argument passed to
400	// __cuda_register_globals() and generate __cudaRegisterFunction() call for
401	// each emitted kernel.
402	llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
403	for (auto &&I : EmittedKernels) {
404	llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
405	llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
406	llvm::Value *Args[] = {
407	&GpuBinaryHandlePtr,
408	Builder.CreateBitCast(I.Kernel, VoidPtrTy),
409	KernelName,
410	KernelName,
411	llvm::ConstantInt::get(IntTy, -1),
412	NullPtr,
413	NullPtr,
414	NullPtr,
415	NullPtr,
416	llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
417	Builder.CreateCall(RegisterFunc, Args);
418	}
419
420	// void __cudaRegisterVar(void *, char , char , const char ,
421	// int, int, int, int)
422	llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
423	CharPtrTy, IntTy, IntTy,
424	IntTy, IntTy};
425	llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
426	llvm::FunctionType::get(IntTy, RegisterVarParams, false),
427	addUnderscoredPrefixToName("RegisterVar"));
428	for (auto &&Info : DeviceVars) {
429	llvm::GlobalVariable *Var = Info.Var;
430	unsigned Flags = Info.Flag;
431	llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
432	uint64_t VarSize =
433	CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
434	llvm::Value *Args[] = {
435	&GpuBinaryHandlePtr,
436	Builder.CreateBitCast(Var, VoidPtrTy),
437	VarName,
438	VarName,
439	llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
440	llvm::ConstantInt::get(IntTy, VarSize),
441	llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
442	llvm::ConstantInt::get(IntTy, 0)};
443	Builder.CreateCall(RegisterVar, Args);
444	}
445
446	Builder.CreateRetVoid();
447	return RegisterKernelsFunc;
448	}
449
450	/// Creates a global constructor function for the module:
451	///
452	/// For CUDA:
453	/// \code
454	/// void __cuda_module_ctor(void*) {
455	/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
456	/// __cuda_register_globals(Handle);
457	/// }
458	/// \endcode
459	///
460	/// For HIP:
461	/// \code
462	/// void __hip_module_ctor(void*) {
463	/// if (__hip_gpubin_handle == 0) {
464	/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
465	/// __hip_register_globals(__hip_gpubin_handle);
466	/// }
467	/// }
468	/// \endcode
469	llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
470	bool IsHIP = CGM.getLangOpts().HIP;
471	// No need to generate ctors/dtors if there is no GPU binary.
472	StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
473	if (CudaGpuBinaryFileName.empty() && !IsHIP)
474	return nullptr;
475
476	// void __{cuda\|hip}_register_globals(void* handle);
477	llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
478	// We always need a function to pass in as callback. Create a dummy
479	// implementation if we don't need to register anything.
480	if (RelocatableDeviceCode && !RegisterGlobalsFunc)
481	RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
482
483	// void ** __{cuda\|hip}RegisterFatBinary(void *);
484	llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
485	llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
486	addUnderscoredPrefixToName("RegisterFatBinary"));
487	// struct { int magic, int version, void * gpu_binary, void * dont_care };
488	llvm::StructType *FatbinWrapperTy =
489	llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
490
491	// Register GPU binary with the CUDA runtime, store returned handle in a
492	// global variable and save a reference in GpuBinaryHandle to be cleaned up
493	// in destructor on exit. Then associate all known kernels with the GPU binary
494	// handle so CUDA runtime can figure out what to call on the GPU side.
495	std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
496	if (!CudaGpuBinaryFileName.empty()) {
497	llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
498	llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
499	if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
500	CGM.getDiags().Report(diag::err_cannot_open_file)
501	<< CudaGpuBinaryFileName << EC.message();
502	return nullptr;
503	}
504	CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
505	}
506
507	llvm::Function *ModuleCtorFunc = llvm::Function::Create(
508	llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
509	llvm::GlobalValue::InternalLinkage,
510	addUnderscoredPrefixToName("_module_ctor"), &TheModule);
511	llvm::BasicBlock *CtorEntryBB =
512	llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
513	CGBuilderTy CtorBuilder(CGM, Context);
514
515	CtorBuilder.SetInsertPoint(CtorEntryBB);
516
517	const char *FatbinConstantName;
518	const char *FatbinSectionName;
519	const char *ModuleIDSectionName;
520	StringRef ModuleIDPrefix;
521	llvm::Constant *FatBinStr;
522	unsigned FatMagic;
523	if (IsHIP) {
524	FatbinConstantName = ".hip_fatbin";
525	FatbinSectionName = ".hipFatBinSegment";
526
527	ModuleIDSectionName = "__hip_module_id";
528	ModuleIDPrefix = "__hip_";
529
530	if (CudaGpuBinary) {
531	// If fatbin is available from early finalization, create a string
532	// literal containing the fat binary loaded from the given file.
533	FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
534	FatbinConstantName, 8);
535	} else {
536	// If fatbin is not available, create an external symbol
537	// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
538	// to contain the fat binary but will be populated somewhere else,
539	// e.g. by lld through link script.
540	FatBinStr = new llvm::GlobalVariable(
541	CGM.getModule(), CGM.Int8Ty,
542	/isConstant=/true, llvm::GlobalValue::ExternalLinkage, nullptr,
543	"__hip_fatbin", nullptr,
544	llvm::GlobalVariable::NotThreadLocal);
545	cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
546	}
547
548	FatMagic = HIPFatMagic;
549	} else {
550	if (RelocatableDeviceCode)
551	FatbinConstantName = CGM.getTriple().isMacOSX()
552	? "__NV_CUDA,__nv_relfatbin"
553	: "__nv_relfatbin";
554	else
555	FatbinConstantName =
556	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
557	// NVIDIA's cuobjdump looks for fatbins in this section.
558	FatbinSectionName =
559	CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
560
561	ModuleIDSectionName = CGM.getTriple().isMacOSX()
562	? "__NV_CUDA,__nv_module_id"
563	: "__nv_module_id";
564	ModuleIDPrefix = "__nv_";
565
566	// For CUDA, create a string literal containing the fat binary loaded from
567	// the given file.
568	FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
569	FatbinConstantName, 8);
570	FatMagic = CudaFatMagic;
571	}
572
573	// Create initialized wrapper structure that points to the loaded GPU binary
574	ConstantInitBuilder Builder(CGM);
575	auto Values = Builder.beginStruct(FatbinWrapperTy);
576	// Fatbin wrapper magic.
577	Values.addInt(IntTy, FatMagic);
578	// Fatbin version.
579	Values.addInt(IntTy, 1);
580	// Data.
581	Values.add(FatBinStr);
582	// Unused in fatbin v1.
583	Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
584	llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
585	addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
586	/constant/ true);
587	FatbinWrapper->setSection(FatbinSectionName);
588
589	// There is only one HIP fat binary per linked module, however there are
590	// multiple constructor functions. Make sure the fat binary is registered
591	// only once. The constructor functions are executed by the dynamic loader
592	// before the program gains control. The dynamic loader cannot execute the
593	// constructor functions concurrently since doing that would not guarantee
594	// thread safety of the loaded program. Therefore we can assume sequential
595	// execution of constructor functions here.
596	if (IsHIP) {
597	auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
598	llvm::GlobalValue::LinkOnceAnyLinkage;
599	llvm::BasicBlock *IfBlock =
600	llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
601	llvm::BasicBlock *ExitBlock =
602	llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
603	// The name, size, and initialization pattern of this variable is part
604	// of HIP ABI.
605	GpuBinaryHandle = new llvm::GlobalVariable(
606	TheModule, VoidPtrPtrTy, /isConstant=/false,
607	Linkage,
608	/Initializer=/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
609	"__hip_gpubin_handle");
610	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
611	// Prevent the weak symbol in different shared libraries being merged.
612	if (Linkage != llvm::GlobalValue::InternalLinkage)
613	GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
614	Address GpuBinaryAddr(
615	GpuBinaryHandle,
616	CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
617	{
618	auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
619	llvm::Constant *Zero =
620	llvm::Constant::getNullValue(HandleValue->getType());
621	llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
622	CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
623	}
624	{
625	CtorBuilder.SetInsertPoint(IfBlock);
626	// GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
627	llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
628	RegisterFatbinFunc,
629	CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
630	CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
631	CtorBuilder.CreateBr(ExitBlock);
632	}
633	{
634	CtorBuilder.SetInsertPoint(ExitBlock);
635	// Call __hip_register_globals(GpuBinaryHandle);
636	if (RegisterGlobalsFunc) {
637	auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
638	CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
639	}
640	}
641	} else if (!RelocatableDeviceCode) {
642	// Register binary with CUDA runtime. This is substantially different in
643	// default mode vs. separate compilation!
644	// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
645	llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
646	RegisterFatbinFunc,
647	CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
648	GpuBinaryHandle = new llvm::GlobalVariable(
649	TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
650	llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
651	GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
652	CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
653	CGM.getPointerAlign());
654
655	// Call __cuda_register_globals(GpuBinaryHandle);
656	if (RegisterGlobalsFunc)
657	CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
658
659	// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
660	if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
661	CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
662	// void __cudaRegisterFatBinaryEnd(void **);
663	llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
664	llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
665	"__cudaRegisterFatBinaryEnd");
666	CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
667	}
668	} else {
669	// Generate a unique module ID.
670	SmallString<64> ModuleID;
671	llvm::raw_svector_ostream OS(ModuleID);
672	OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
673	llvm::Constant *ModuleIDConstant =
674	makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
675
676	// Create an alias for the FatbinWrapper that nvcc will look for.
677	llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
678	Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
679
680	// void __cudaRegisterLinkedBinary%ModuleID%(void ()(void ), void *,
681	// void , void ()(void **))
682	SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
683	RegisterLinkedBinaryName += ModuleID;
684	llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
685	getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
686
687	(0) . __assert_fail ("RegisterGlobalsFunc && \"Expecting at least dummy function!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 687, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
688	llvm::Value *Args[] = {RegisterGlobalsFunc,
689	CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
690	ModuleIDConstant,
691	makeDummyFunction(getCallbackFnTy())};
692	CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
693	}
694
695	// Create destructor and register it with atexit() the way NVCC does it. Doing
696	// it during regular destructor phase worked in CUDA before 9.2 but results in
697	// double-free in 9.2.
698	if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
699	// extern "C" int atexit(void (*f)(void));
700	llvm::FunctionType *AtExitTy =
701	llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
702	llvm::FunctionCallee AtExitFunc =
703	CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
704	/Local=/true);
705	CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
706	}
707
708	CtorBuilder.CreateRetVoid();
709	return ModuleCtorFunc;
710	}
711
712	/// Creates a global destructor function that unregisters the GPU code blob
713	/// registered by constructor.
714	///
715	/// For CUDA:
716	/// \code
717	/// void __cuda_module_dtor(void*) {
718	/// __cudaUnregisterFatBinary(Handle);
719	/// }
720	/// \endcode
721	///
722	/// For HIP:
723	/// \code
724	/// void __hip_module_dtor(void*) {
725	/// if (__hip_gpubin_handle) {
726	/// __hipUnregisterFatBinary(__hip_gpubin_handle);
727	/// __hip_gpubin_handle = 0;
728	/// }
729	/// }
730	/// \endcode
731	llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
732	// No need for destructor if we don't have a handle to unregister.
733	if (!GpuBinaryHandle)
734	return nullptr;
735
736	// void __cudaUnregisterFatBinary(void ** handle);
737	llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
738	llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
739	addUnderscoredPrefixToName("UnregisterFatBinary"));
740
741	llvm::Function *ModuleDtorFunc = llvm::Function::Create(
742	llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
743	llvm::GlobalValue::InternalLinkage,
744	addUnderscoredPrefixToName("_module_dtor"), &TheModule);
745
746	llvm::BasicBlock *DtorEntryBB =
747	llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
748	CGBuilderTy DtorBuilder(CGM, Context);
749	DtorBuilder.SetInsertPoint(DtorEntryBB);
750
751	Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
752	GpuBinaryHandle->getAlignment()));
753	auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
754	// There is only one HIP fat binary per linked module, however there are
755	// multiple destructor functions. Make sure the fat binary is unregistered
756	// only once.
757	if (CGM.getLangOpts().HIP) {
758	llvm::BasicBlock *IfBlock =
759	llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
760	llvm::BasicBlock *ExitBlock =
761	llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
762	llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
763	llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
764	DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
765
766	DtorBuilder.SetInsertPoint(IfBlock);
767	DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
768	DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
769	DtorBuilder.CreateBr(ExitBlock);
770
771	DtorBuilder.SetInsertPoint(ExitBlock);
772	} else {
773	DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
774	}
775	DtorBuilder.CreateRetVoid();
776	return ModuleDtorFunc;
777	}
778
779	CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
780	return new CGNVCUDARuntime(CGM);
781	}
782

Clang Project