Clang Project

clang_source_code/lib/CodeGen/CGCUDANV.cpp
1//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10// runtime library.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGCUDARuntime.h"
15#include "CodeGenFunction.h"
16#include "CodeGenModule.h"
17#include "clang/AST/Decl.h"
18#include "clang/Basic/Cuda.h"
19#include "clang/CodeGen/CodeGenABITypes.h"
20#include "clang/CodeGen/ConstantInitBuilder.h"
21#include "llvm/IR/BasicBlock.h"
22#include "llvm/IR/Constants.h"
23#include "llvm/IR/DerivedTypes.h"
24#include "llvm/Support/Format.h"
25
26using namespace clang;
27using namespace CodeGen;
28
29namespace {
30constexpr unsigned CudaFatMagic = 0x466243b1;
31constexpr unsigned HIPFatMagic = 0x48495046// "HIPF"
32
33class CGNVCUDARuntime : public CGCUDARuntime {
34
35private:
36  llvm::IntegerType *IntTy, *SizeTy;
37  llvm::Type *VoidTy;
38  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
39
40  /// Convenience reference to LLVM Context
41  llvm::LLVMContext &Context;
42  /// Convenience reference to the current module
43  llvm::Module &TheModule;
44  /// Keeps track of kernel launch stubs emitted in this module
45  struct KernelInfo {
46    llvm::Function *Kernel;
47    const Decl *D;
48  };
49  llvm::SmallVector<KernelInfo16EmittedKernels;
50  struct VarInfo {
51    llvm::GlobalVariable *Var;
52    const VarDecl *D;
53    unsigned Flag;
54  };
55  llvm::SmallVector<VarInfo16DeviceVars;
56  /// Keeps track of variable containing handle of GPU binary. Populated by
57  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
58  /// ModuleDtorFunction()
59  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
60  /// Whether we generate relocatable device code.
61  bool RelocatableDeviceCode;
62  /// Mangle context for device.
63  std::unique_ptr<MangleContextDeviceMC;
64
65  llvm::FunctionCallee getSetupArgumentFn() const;
66  llvm::FunctionCallee getLaunchFn() const;
67
68  llvm::FunctionType *getRegisterGlobalsFnTy() const;
69  llvm::FunctionType *getCallbackFnTy() const;
70  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
71  std::string addPrefixToName(StringRef FuncNameconst;
72  std::string addUnderscoredPrefixToName(StringRef FuncNameconst;
73
74  /// Creates a function to register all kernel stubs generated in this module.
75  llvm::Function *makeRegisterGlobalsFn();
76
77  /// Helper function that generates a constant string and returns a pointer to
78  /// the start of the string.  The result of this function can be used anywhere
79  /// where the C code specifies const char*.
80  llvm::Constant *makeConstantString(const std::string &Str,
81                                     const std::string &Name = "",
82                                     const std::string &SectionName = "",
83                                     unsigned Alignment = 0) {
84    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
85                               llvm::ConstantInt::get(SizeTy, 0)};
86    auto ConstStr = CGM.GetAddrOfConstantCString(StrName.c_str());
87    llvm::GlobalVariable *GV =
88        cast<llvm::GlobalVariable>(ConstStr.getPointer());
89    if (!SectionName.empty()) {
90      GV->setSection(SectionName);
91      // Mark the address as used which make sure that this section isn't
92      // merged and we will really have it in the object file.
93      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
94    }
95    if (Alignment)
96      GV->setAlignment(Alignment);
97
98    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
99                                                ConstStr.getPointer(), Zeros);
100  }
101
102  /// Helper function that generates an empty dummy function returning void.
103  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
104     (0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(FnTy->getReturnType()->isVoidTy() &&
105 (0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">           "Can only generate dummy functions returning void!");
106    llvm::Function *DummyFunc = llvm::Function::Create(
107        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
108
109    llvm::BasicBlock *DummyBlock =
110        llvm::BasicBlock::Create(Context, "", DummyFunc);
111    CGBuilderTy FuncBuilder(CGMContext);
112    FuncBuilder.SetInsertPoint(DummyBlock);
113    FuncBuilder.CreateRetVoid();
114
115    return DummyFunc;
116  }
117
118  void emitDeviceStubBodyLegacy(CodeGenFunction &CGFFunctionArgList &Args);
119  void emitDeviceStubBodyNew(CodeGenFunction &CGFFunctionArgList &Args);
120  std::string getDeviceSideName(const Decl *ND);
121
122public:
123  CGNVCUDARuntime(CodeGenModule &CGM);
124
125  void emitDeviceStub(CodeGenFunction &CGFFunctionArgList &Args) override;
126  void registerDeviceVar(const VarDecl *VDllvm::GlobalVariable &Var,
127                         unsigned Flags) override {
128    DeviceVars.push_back({&Var, VD, Flags});
129  }
130
131  /// Creates module constructor function
132  llvm::Function *makeModuleCtorFunction() override;
133  /// Creates module destructor function
134  llvm::Function *makeModuleDtorFunction() override;
135};
136
137}
138
139std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncNameconst {
140  if (CGM.getLangOpts().HIP)
141    return ((Twine("hip") + Twine(FuncName)).str());
142  return ((Twine("cuda") + Twine(FuncName)).str());
143}
144std::string
145CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncNameconst {
146  if (CGM.getLangOpts().HIP)
147    return ((Twine("__hip") + Twine(FuncName)).str());
148  return ((Twine("__cuda") + Twine(FuncName)).str());
149}
150
151CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
152    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
153      TheModule(CGM.getModule()),
154      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
155      DeviceMC(CGM.getContext().createMangleContext(
156          CGM.getContext().getAuxTargetInfo())) {
157  CodeGen::CodeGenTypes &Types = CGM.getTypes();
158  ASTContext &Ctx = CGM.getContext();
159
160  IntTy = CGM.IntTy;
161  SizeTy = CGM.SizeTy;
162  VoidTy = CGM.VoidTy;
163
164  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
165  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
166  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
167}
168
169llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
170  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
171  llvm::Type *Params[] = {VoidPtrTySizeTySizeTy};
172  return CGM.CreateRuntimeFunction(
173      llvm::FunctionType::get(IntTy, Params, false),
174      addPrefixToName("SetupArgument"));
175}
176
177llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
178  if (CGM.getLangOpts().HIP) {
179    // hipError_t hipLaunchByPtr(char *);
180    return CGM.CreateRuntimeFunction(
181        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
182  } else {
183    // cudaError_t cudaLaunch(char *);
184    return CGM.CreateRuntimeFunction(
185        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
186  }
187}
188
189llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
190  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
191}
192
193llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
194  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
195}
196
197llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
198  auto CallbackFnTy = getCallbackFnTy();
199  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
200  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
201                          VoidPtrTy, CallbackFnTy->getPointerTo()};
202  return llvm::FunctionType::get(VoidTy, Params, false);
203}
204
205std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
206  auto *ND = cast<const NamedDecl>(D);
207  std::string DeviceSideName;
208  if (DeviceMC->shouldMangleDeclName(ND)) {
209    SmallString<256Buffer;
210    llvm::raw_svector_ostream Out(Buffer);
211    DeviceMC->mangleName(ND, Out);
212    DeviceSideName = Out.str();
213  } else
214    DeviceSideName = ND->getIdentifier()->getName();
215  return DeviceSideName;
216}
217
218void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
219                                     FunctionArgList &Args) {
220  getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() ||
221getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">         getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() ||
222getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">         CGF.CGM.getContext().getTargetInfo().getCXXABI() !=
223getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">             CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI());
224
225  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
226  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
227                         CudaFeature::CUDA_USES_NEW_LAUNCH))
228    emitDeviceStubBodyNew(CGFArgs);
229  else
230    emitDeviceStubBodyLegacy(CGFArgs);
231}
232
233// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
234// array and kernels are launched using cudaLaunchKernel().
235void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
236                                            FunctionArgList &Args) {
237  // Build the shadow stack entry at the very start of the function.
238
239  // Calculate amount of space we will need for all arguments.  If we have no
240  // args, allocate a single pointer so we still have a valid pointer to the
241  // argument array that we can pass to runtime, even if it will be unused.
242  Address KernelArgs = CGF.CreateTempAlloca(
243      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
244      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
245  // Store pointers to the arguments in a locally allocated launch_args.
246  for (unsigned i = 0i < Args.size(); ++i) {
247    llvm::ValueVarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
248    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtrVoidPtrTy);
249    CGF.Builder.CreateDefaultAlignedStore(
250        VoidVarPtrCGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
251  }
252
253  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
254
255  // Lookup cudaLaunchKernel function.
256  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
257  //                              void **args, size_t sharedMem,
258  //                              cudaStream_t stream);
259  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
260  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
261  IdentifierInfo &cudaLaunchKernelII =
262      CGM.getContext().Idents.get("cudaLaunchKernel");
263  FunctionDecl *cudaLaunchKernelFD = nullptr;
264  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
265    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
266      cudaLaunchKernelFD = FD;
267  }
268
269  if (cudaLaunchKernelFD == nullptr) {
270    CGM.Error(CGF.CurFuncDecl->getLocation(),
271              "Can't find declaration for cudaLaunchKernel()");
272    return;
273  }
274  // Create temporary dim3 grid_dim, block_dim.
275  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
276  QualType Dim3Ty = GridDimParam->getType();
277  Address GridDim =
278      CGF.CreateMemTemp(Dim3TyCharUnits::fromQuantity(8), "grid_dim");
279  Address BlockDim =
280      CGF.CreateMemTemp(Dim3TyCharUnits::fromQuantity(8), "block_dim");
281  Address ShmemSize =
282      CGF.CreateTempAlloca(SizeTyCGM.getSizeAlign(), "shmem_size");
283  Address Stream =
284      CGF.CreateTempAlloca(VoidPtrTyCGM.getPointerAlign(), "stream");
285  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
286      llvm::FunctionType::get(IntTy,
287                              {/*gridDim=*/GridDim.getType(),
288                               /*blockDim=*/BlockDim.getType(),
289                               /*ShmemSize=*/ShmemSize.getType(),
290                               /*Stream=*/Stream.getType()},
291                              /*isVarArg=*/false),
292      "__cudaPopCallConfiguration");
293
294  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
295                              {GridDim.getPointer(), BlockDim.getPointer(),
296                               ShmemSize.getPointer(), Stream.getPointer()});
297
298  // Emit the call to cudaLaunch
299  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFnVoidPtrTy);
300  CallArgList LaunchKernelArgs;
301  LaunchKernelArgs.add(RValue::get(Kernel),
302                       cudaLaunchKernelFD->getParamDecl(0)->getType());
303  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
304  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
305  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
306                       cudaLaunchKernelFD->getParamDecl(3)->getType());
307  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
308                       cudaLaunchKernelFD->getParamDecl(4)->getType());
309  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
310                       cudaLaunchKernelFD->getParamDecl(5)->getType());
311
312  QualType QT = cudaLaunchKernelFD->getType();
313  QualType CQT = QT.getCanonicalType();
314  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
315  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
316
317  const CGFunctionInfo &FI =
318      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
319  llvm::FunctionCallee cudaLaunchKernelFn =
320      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
321  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
322               LaunchKernelArgs);
323  CGF.EmitBranch(EndBlock);
324
325  CGF.EmitBlock(EndBlock);
326}
327
328void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
329                                               FunctionArgList &Args) {
330  // Emit a call to cudaSetupArgument for each arg in Args.
331  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
332  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
333  CharUnits Offset = CharUnits::Zero();
334  for (const VarDecl *A : Args) {
335    CharUnits TyWidth, TyAlign;
336    std::tie(TyWidth, TyAlign) =
337        CGM.getContext().getTypeInfoInChars(A->getType());
338    Offset = Offset.alignTo(TyAlign);
339    llvm::Value *Args[] = {
340        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
341                                      VoidPtrTy),
342        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
343        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
344    };
345    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
346    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
347    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
348    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
349    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
350    CGF.EmitBlock(NextBlock);
351    Offset += TyWidth;
352  }
353
354  // Emit the call to cudaLaunch
355  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
356  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFnCharPtrTy);
357  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
358  CGF.EmitBranch(EndBlock);
359
360  CGF.EmitBlock(EndBlock);
361}
362
363/// Creates a function that sets up state on the host side for CUDA objects that
364/// have a presence on both the host and device sides. Specifically, registers
365/// the host side of kernel functions and device global variables with the CUDA
366/// runtime.
367/// \code
368/// void __cuda_register_globals(void** GpuBinaryHandle) {
369///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
370///    ...
371///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
372///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
373///    ...
374///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
375/// }
376/// \endcode
377llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
378  // No need to register anything
379  if (EmittedKernels.empty() && DeviceVars.empty())
380    return nullptr;
381
382  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
383      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
384      addUnderscoredPrefixToName("_register_globals"), &TheModule);
385  llvm::BasicBlock *EntryBB =
386      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
387  CGBuilderTy Builder(CGMContext);
388  Builder.SetInsertPoint(EntryBB);
389
390  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
391  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
392  llvm::Type *RegisterFuncParams[] = {
393      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
394      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
395  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
396      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
397      addUnderscoredPrefixToName("RegisterFunction"));
398
399  // Extract GpuBinaryHandle passed as the first argument passed to
400  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
401  // each emitted kernel.
402  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
403  for (auto &&I : EmittedKernels) {
404    llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
405    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
406    llvm::Value *Args[] = {
407        &GpuBinaryHandlePtr,
408        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
409        KernelName,
410        KernelName,
411        llvm::ConstantInt::get(IntTy, -1),
412        NullPtr,
413        NullPtr,
414        NullPtr,
415        NullPtr,
416        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
417    Builder.CreateCall(RegisterFunc, Args);
418  }
419
420  // void __cudaRegisterVar(void **, char *, char *, const char *,
421  //                        int, int, int, int)
422  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTyCharPtrTyCharPtrTy,
423                                     CharPtrTy,    IntTy,     IntTy,
424                                     IntTy,        IntTy};
425  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
426      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
427      addUnderscoredPrefixToName("RegisterVar"));
428  for (auto &&Info : DeviceVars) {
429    llvm::GlobalVariable *Var = Info.Var;
430    unsigned Flags = Info.Flag;
431    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
432    uint64_t VarSize =
433        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
434    llvm::Value *Args[] = {
435        &GpuBinaryHandlePtr,
436        Builder.CreateBitCast(Var, VoidPtrTy),
437        VarName,
438        VarName,
439        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
440        llvm::ConstantInt::get(IntTy, VarSize),
441        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
442        llvm::ConstantInt::get(IntTy, 0)};
443    Builder.CreateCall(RegisterVar, Args);
444  }
445
446  Builder.CreateRetVoid();
447  return RegisterKernelsFunc;
448}
449
450/// Creates a global constructor function for the module:
451///
452/// For CUDA:
453/// \code
454/// void __cuda_module_ctor(void*) {
455///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
456///     __cuda_register_globals(Handle);
457/// }
458/// \endcode
459///
460/// For HIP:
461/// \code
462/// void __hip_module_ctor(void*) {
463///     if (__hip_gpubin_handle == 0) {
464///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
465///         __hip_register_globals(__hip_gpubin_handle);
466///     }
467/// }
468/// \endcode
469llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
470  bool IsHIP = CGM.getLangOpts().HIP;
471  // No need to generate ctors/dtors if there is no GPU binary.
472  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
473  if (CudaGpuBinaryFileName.empty() && !IsHIP)
474    return nullptr;
475
476  // void __{cuda|hip}_register_globals(void* handle);
477  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
478  // We always need a function to pass in as callback. Create a dummy
479  // implementation if we don't need to register anything.
480  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
481    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
482
483  // void ** __{cuda|hip}RegisterFatBinary(void *);
484  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
485      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
486      addUnderscoredPrefixToName("RegisterFatBinary"));
487  // struct { int magic, int version, void * gpu_binary, void * dont_care };
488  llvm::StructType *FatbinWrapperTy =
489      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
490
491  // Register GPU binary with the CUDA runtime, store returned handle in a
492  // global variable and save a reference in GpuBinaryHandle to be cleaned up
493  // in destructor on exit. Then associate all known kernels with the GPU binary
494  // handle so CUDA runtime can figure out what to call on the GPU side.
495  std::unique_ptr<llvm::MemoryBufferCudaGpuBinary = nullptr;
496  if (!CudaGpuBinaryFileName.empty()) {
497    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
498        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
499    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
500      CGM.getDiags().Report(diag::err_cannot_open_file)
501          << CudaGpuBinaryFileName << EC.message();
502      return nullptr;
503    }
504    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
505  }
506
507  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
508      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
509      llvm::GlobalValue::InternalLinkage,
510      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
511  llvm::BasicBlock *CtorEntryBB =
512      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
513  CGBuilderTy CtorBuilder(CGMContext);
514
515  CtorBuilder.SetInsertPoint(CtorEntryBB);
516
517  const char *FatbinConstantName;
518  const char *FatbinSectionName;
519  const char *ModuleIDSectionName;
520  StringRef ModuleIDPrefix;
521  llvm::Constant *FatBinStr;
522  unsigned FatMagic;
523  if (IsHIP) {
524    FatbinConstantName = ".hip_fatbin";
525    FatbinSectionName = ".hipFatBinSegment";
526
527    ModuleIDSectionName = "__hip_module_id";
528    ModuleIDPrefix = "__hip_";
529
530    if (CudaGpuBinary) {
531      // If fatbin is available from early finalization, create a string
532      // literal containing the fat binary loaded from the given file.
533      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
534                                     FatbinConstantName, 8);
535    } else {
536      // If fatbin is not available, create an external symbol
537      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
538      // to contain the fat binary but will be populated somewhere else,
539      // e.g. by lld through link script.
540      FatBinStr = new llvm::GlobalVariable(
541        CGM.getModule(), CGM.Int8Ty,
542        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
543        "__hip_fatbin"nullptr,
544        llvm::GlobalVariable::NotThreadLocal);
545      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
546    }
547
548    FatMagic = HIPFatMagic;
549  } else {
550    if (RelocatableDeviceCode)
551      FatbinConstantName = CGM.getTriple().isMacOSX()
552                               ? "__NV_CUDA,__nv_relfatbin"
553                               : "__nv_relfatbin";
554    else
555      FatbinConstantName =
556          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
557    // NVIDIA's cuobjdump looks for fatbins in this section.
558    FatbinSectionName =
559        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
560
561    ModuleIDSectionName = CGM.getTriple().isMacOSX()
562                              ? "__NV_CUDA,__nv_module_id"
563                              : "__nv_module_id";
564    ModuleIDPrefix = "__nv_";
565
566    // For CUDA, create a string literal containing the fat binary loaded from
567    // the given file.
568    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
569                                   FatbinConstantName, 8);
570    FatMagic = CudaFatMagic;
571  }
572
573  // Create initialized wrapper structure that points to the loaded GPU binary
574  ConstantInitBuilder Builder(CGM);
575  auto Values = Builder.beginStruct(FatbinWrapperTy);
576  // Fatbin wrapper magic.
577  Values.addInt(IntTy, FatMagic);
578  // Fatbin version.
579  Values.addInt(IntTy, 1);
580  // Data.
581  Values.add(FatBinStr);
582  // Unused in fatbin v1.
583  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
584  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
585      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
586      /*constant*/ true);
587  FatbinWrapper->setSection(FatbinSectionName);
588
589  // There is only one HIP fat binary per linked module, however there are
590  // multiple constructor functions. Make sure the fat binary is registered
591  // only once. The constructor functions are executed by the dynamic loader
592  // before the program gains control. The dynamic loader cannot execute the
593  // constructor functions concurrently since doing that would not guarantee
594  // thread safety of the loaded program. Therefore we can assume sequential
595  // execution of constructor functions here.
596  if (IsHIP) {
597    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
598        llvm::GlobalValue::LinkOnceAnyLinkage;
599    llvm::BasicBlock *IfBlock =
600        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
601    llvm::BasicBlock *ExitBlock =
602        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
603    // The name, size, and initialization pattern of this variable is part
604    // of HIP ABI.
605    GpuBinaryHandle = new llvm::GlobalVariable(
606        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
607        Linkage,
608        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
609        "__hip_gpubin_handle");
610    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
611    // Prevent the weak symbol in different shared libraries being merged.
612    if (Linkage != llvm::GlobalValue::InternalLinkage)
613      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
614    Address GpuBinaryAddr(
615        GpuBinaryHandle,
616        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
617    {
618      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
619      llvm::Constant *Zero =
620          llvm::Constant::getNullValue(HandleValue->getType());
621      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
622      CtorBuilder.CreateCondBr(EQZeroIfBlockExitBlock);
623    }
624    {
625      CtorBuilder.SetInsertPoint(IfBlock);
626      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
627      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
628          RegisterFatbinFunc,
629          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
630      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
631      CtorBuilder.CreateBr(ExitBlock);
632    }
633    {
634      CtorBuilder.SetInsertPoint(ExitBlock);
635      // Call __hip_register_globals(GpuBinaryHandle);
636      if (RegisterGlobalsFunc) {
637        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
638        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
639      }
640    }
641  } else if (!RelocatableDeviceCode) {
642    // Register binary with CUDA runtime. This is substantially different in
643    // default mode vs. separate compilation!
644    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
645    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
646        RegisterFatbinFunc,
647        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
648    GpuBinaryHandle = new llvm::GlobalVariable(
649        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
650        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
651    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
652    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
653                                   CGM.getPointerAlign());
654
655    // Call __cuda_register_globals(GpuBinaryHandle);
656    if (RegisterGlobalsFunc)
657      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
658
659    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
660    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
661                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
662      // void __cudaRegisterFatBinaryEnd(void **);
663      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
664          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
665          "__cudaRegisterFatBinaryEnd");
666      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
667    }
668  } else {
669    // Generate a unique module ID.
670    SmallString<64ModuleID;
671    llvm::raw_svector_ostream OS(ModuleID);
672    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
673    llvm::Constant *ModuleIDConstant =
674        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
675
676    // Create an alias for the FatbinWrapper that nvcc will look for.
677    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
678                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
679
680    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
681    // void *, void (*)(void **))
682    SmallString<128RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
683    RegisterLinkedBinaryName += ModuleID;
684    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
685        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
686
687     (0) . __assert_fail ("RegisterGlobalsFunc && \"Expecting at least dummy function!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 687, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
688    llvm::Value *Args[] = {RegisterGlobalsFunc,
689                           CtorBuilder.CreateBitCast(FatbinWrapperVoidPtrTy),
690                           ModuleIDConstant,
691                           makeDummyFunction(getCallbackFnTy())};
692    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
693  }
694
695  // Create destructor and register it with atexit() the way NVCC does it. Doing
696  // it during regular destructor phase worked in CUDA before 9.2 but results in
697  // double-free in 9.2.
698  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
699    // extern "C" int atexit(void (*f)(void));
700    llvm::FunctionType *AtExitTy =
701        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
702    llvm::FunctionCallee AtExitFunc =
703        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
704                                  /*Local=*/true);
705    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
706  }
707
708  CtorBuilder.CreateRetVoid();
709  return ModuleCtorFunc;
710}
711
712/// Creates a global destructor function that unregisters the GPU code blob
713/// registered by constructor.
714///
715/// For CUDA:
716/// \code
717/// void __cuda_module_dtor(void*) {
718///     __cudaUnregisterFatBinary(Handle);
719/// }
720/// \endcode
721///
722/// For HIP:
723/// \code
724/// void __hip_module_dtor(void*) {
725///     if (__hip_gpubin_handle) {
726///         __hipUnregisterFatBinary(__hip_gpubin_handle);
727///         __hip_gpubin_handle = 0;
728///     }
729/// }
730/// \endcode
731llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
732  // No need for destructor if we don't have a handle to unregister.
733  if (!GpuBinaryHandle)
734    return nullptr;
735
736  // void __cudaUnregisterFatBinary(void ** handle);
737  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
738      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
739      addUnderscoredPrefixToName("UnregisterFatBinary"));
740
741  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
742      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
743      llvm::GlobalValue::InternalLinkage,
744      addUnderscoredPrefixToName("_module_dtor"), &TheModule);
745
746  llvm::BasicBlock *DtorEntryBB =
747      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
748  CGBuilderTy DtorBuilder(CGMContext);
749  DtorBuilder.SetInsertPoint(DtorEntryBB);
750
751  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
752                                             GpuBinaryHandle->getAlignment()));
753  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
754  // There is only one HIP fat binary per linked module, however there are
755  // multiple destructor functions. Make sure the fat binary is unregistered
756  // only once.
757  if (CGM.getLangOpts().HIP) {
758    llvm::BasicBlock *IfBlock =
759        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
760    llvm::BasicBlock *ExitBlock =
761        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
762    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
763    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
764    DtorBuilder.CreateCondBr(NEZeroIfBlockExitBlock);
765
766    DtorBuilder.SetInsertPoint(IfBlock);
767    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
768    DtorBuilder.CreateStore(ZeroGpuBinaryAddr);
769    DtorBuilder.CreateBr(ExitBlock);
770
771    DtorBuilder.SetInsertPoint(ExitBlock);
772  } else {
773    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
774  }
775  DtorBuilder.CreateRetVoid();
776  return ModuleDtorFunc;
777}
778
779CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
780  return new CGNVCUDARuntime(CGM);
781}
782