1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | #include "CGCUDARuntime.h" |
15 | #include "CodeGenFunction.h" |
16 | #include "CodeGenModule.h" |
17 | #include "clang/AST/Decl.h" |
18 | #include "clang/Basic/Cuda.h" |
19 | #include "clang/CodeGen/CodeGenABITypes.h" |
20 | #include "clang/CodeGen/ConstantInitBuilder.h" |
21 | #include "llvm/IR/BasicBlock.h" |
22 | #include "llvm/IR/Constants.h" |
23 | #include "llvm/IR/DerivedTypes.h" |
24 | #include "llvm/Support/Format.h" |
25 | |
26 | using namespace clang; |
27 | using namespace CodeGen; |
28 | |
29 | namespace { |
30 | constexpr unsigned CudaFatMagic = 0x466243b1; |
31 | constexpr unsigned HIPFatMagic = 0x48495046; |
32 | |
33 | class CGNVCUDARuntime : public CGCUDARuntime { |
34 | |
35 | private: |
36 | llvm::IntegerType *IntTy, *SizeTy; |
37 | llvm::Type *VoidTy; |
38 | llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; |
39 | |
40 | |
41 | llvm::LLVMContext &Context; |
42 | |
43 | llvm::Module &TheModule; |
44 | |
45 | struct KernelInfo { |
46 | llvm::Function *Kernel; |
47 | const Decl *D; |
48 | }; |
49 | llvm::SmallVector<KernelInfo, 16> EmittedKernels; |
50 | struct VarInfo { |
51 | llvm::GlobalVariable *Var; |
52 | const VarDecl *D; |
53 | unsigned Flag; |
54 | }; |
55 | llvm::SmallVector<VarInfo, 16> DeviceVars; |
56 | |
57 | |
58 | |
59 | llvm::GlobalVariable *GpuBinaryHandle = nullptr; |
60 | |
61 | bool RelocatableDeviceCode; |
62 | |
63 | std::unique_ptr<MangleContext> DeviceMC; |
64 | |
65 | llvm::FunctionCallee getSetupArgumentFn() const; |
66 | llvm::FunctionCallee getLaunchFn() const; |
67 | |
68 | llvm::FunctionType *getRegisterGlobalsFnTy() const; |
69 | llvm::FunctionType *getCallbackFnTy() const; |
70 | llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; |
71 | std::string addPrefixToName(StringRef FuncName) const; |
72 | std::string addUnderscoredPrefixToName(StringRef FuncName) const; |
73 | |
74 | |
75 | llvm::Function *makeRegisterGlobalsFn(); |
76 | |
77 | |
78 | |
79 | |
80 | llvm::Constant *makeConstantString(const std::string &Str, |
81 | const std::string &Name = "", |
82 | const std::string &SectionName = "", |
83 | unsigned Alignment = 0) { |
84 | llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), |
85 | llvm::ConstantInt::get(SizeTy, 0)}; |
86 | auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); |
87 | llvm::GlobalVariable *GV = |
88 | cast<llvm::GlobalVariable>(ConstStr.getPointer()); |
89 | if (!SectionName.empty()) { |
90 | GV->setSection(SectionName); |
91 | |
92 | |
93 | GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); |
94 | } |
95 | if (Alignment) |
96 | GV->setAlignment(Alignment); |
97 | |
98 | return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), |
99 | ConstStr.getPointer(), Zeros); |
100 | } |
101 | |
102 | |
103 | llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { |
104 | (0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(FnTy->getReturnType()->isVoidTy() && |
105 | (0) . __assert_fail ("FnTy->getReturnType()->isVoidTy() && \"Can only generate dummy functions returning void!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 105, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> "Can only generate dummy functions returning void!"); |
106 | llvm::Function *DummyFunc = llvm::Function::Create( |
107 | FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); |
108 | |
109 | llvm::BasicBlock *DummyBlock = |
110 | llvm::BasicBlock::Create(Context, "", DummyFunc); |
111 | CGBuilderTy FuncBuilder(CGM, Context); |
112 | FuncBuilder.SetInsertPoint(DummyBlock); |
113 | FuncBuilder.CreateRetVoid(); |
114 | |
115 | return DummyFunc; |
116 | } |
117 | |
118 | void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); |
119 | void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); |
120 | std::string getDeviceSideName(const Decl *ND); |
121 | |
122 | public: |
123 | CGNVCUDARuntime(CodeGenModule &CGM); |
124 | |
125 | void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; |
126 | void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, |
127 | unsigned Flags) override { |
128 | DeviceVars.push_back({&Var, VD, Flags}); |
129 | } |
130 | |
131 | |
132 | llvm::Function *makeModuleCtorFunction() override; |
133 | |
134 | llvm::Function *makeModuleDtorFunction() override; |
135 | }; |
136 | |
137 | } |
138 | |
139 | std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
140 | if (CGM.getLangOpts().HIP) |
141 | return ((Twine("hip") + Twine(FuncName)).str()); |
142 | return ((Twine("cuda") + Twine(FuncName)).str()); |
143 | } |
144 | std::string |
145 | CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { |
146 | if (CGM.getLangOpts().HIP) |
147 | return ((Twine("__hip") + Twine(FuncName)).str()); |
148 | return ((Twine("__cuda") + Twine(FuncName)).str()); |
149 | } |
150 | |
151 | CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) |
152 | : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), |
153 | TheModule(CGM.getModule()), |
154 | RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), |
155 | DeviceMC(CGM.getContext().createMangleContext( |
156 | CGM.getContext().getAuxTargetInfo())) { |
157 | CodeGen::CodeGenTypes &Types = CGM.getTypes(); |
158 | ASTContext &Ctx = CGM.getContext(); |
159 | |
160 | IntTy = CGM.IntTy; |
161 | SizeTy = CGM.SizeTy; |
162 | VoidTy = CGM.VoidTy; |
163 | |
164 | CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); |
165 | VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); |
166 | VoidPtrPtrTy = VoidPtrTy->getPointerTo(); |
167 | } |
168 | |
169 | llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { |
170 | |
171 | llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; |
172 | return CGM.CreateRuntimeFunction( |
173 | llvm::FunctionType::get(IntTy, Params, false), |
174 | addPrefixToName("SetupArgument")); |
175 | } |
176 | |
177 | llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { |
178 | if (CGM.getLangOpts().HIP) { |
179 | |
180 | return CGM.CreateRuntimeFunction( |
181 | llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); |
182 | } else { |
183 | |
184 | return CGM.CreateRuntimeFunction( |
185 | llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); |
186 | } |
187 | } |
188 | |
189 | llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { |
190 | return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); |
191 | } |
192 | |
193 | llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { |
194 | return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); |
195 | } |
196 | |
197 | llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { |
198 | auto CallbackFnTy = getCallbackFnTy(); |
199 | auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); |
200 | llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, |
201 | VoidPtrTy, CallbackFnTy->getPointerTo()}; |
202 | return llvm::FunctionType::get(VoidTy, Params, false); |
203 | } |
204 | |
205 | std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { |
206 | auto *ND = cast<const NamedDecl>(D); |
207 | std::string DeviceSideName; |
208 | if (DeviceMC->shouldMangleDeclName(ND)) { |
209 | SmallString<256> Buffer; |
210 | llvm::raw_svector_ostream Out(Buffer); |
211 | DeviceMC->mangleName(ND, Out); |
212 | DeviceSideName = Out.str(); |
213 | } else |
214 | DeviceSideName = ND->getIdentifier()->getName(); |
215 | return DeviceSideName; |
216 | } |
217 | |
218 | void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, |
219 | FunctionArgList &Args) { |
220 | getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || |
221 | getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() || |
222 | getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> CGF.CGM.getContext().getTargetInfo().getCXXABI() != |
223 | getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()) ? static_cast (0) . __assert_fail ("getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || getDeviceSideName(CGF.CurFuncDecl) + \".stub\" == CGF.CurFn->getName() || CGF.CGM.getContext().getTargetInfo().getCXXABI() != CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 223, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true"> CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()); |
224 | |
225 | EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); |
226 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
227 | CudaFeature::CUDA_USES_NEW_LAUNCH)) |
228 | emitDeviceStubBodyNew(CGF, Args); |
229 | else |
230 | emitDeviceStubBodyLegacy(CGF, Args); |
231 | } |
232 | |
233 | |
234 | |
235 | void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, |
236 | FunctionArgList &Args) { |
237 | |
238 | |
239 | |
240 | |
241 | |
242 | Address KernelArgs = CGF.CreateTempAlloca( |
243 | VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", |
244 | llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); |
245 | |
246 | for (unsigned i = 0; i < Args.size(); ++i) { |
247 | llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); |
248 | llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); |
249 | CGF.Builder.CreateDefaultAlignedStore( |
250 | VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); |
251 | } |
252 | |
253 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); |
260 | DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); |
261 | IdentifierInfo &cudaLaunchKernelII = |
262 | CGM.getContext().Idents.get("cudaLaunchKernel"); |
263 | FunctionDecl *cudaLaunchKernelFD = nullptr; |
264 | for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { |
265 | if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) |
266 | cudaLaunchKernelFD = FD; |
267 | } |
268 | |
269 | if (cudaLaunchKernelFD == nullptr) { |
270 | CGM.Error(CGF.CurFuncDecl->getLocation(), |
271 | "Can't find declaration for cudaLaunchKernel()"); |
272 | return; |
273 | } |
274 | |
275 | ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); |
276 | QualType Dim3Ty = GridDimParam->getType(); |
277 | Address GridDim = |
278 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); |
279 | Address BlockDim = |
280 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); |
281 | Address ShmemSize = |
282 | CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); |
283 | Address Stream = |
284 | CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); |
285 | llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( |
286 | llvm::FunctionType::get(IntTy, |
287 | {GridDim.getType(), |
288 | BlockDim.getType(), |
289 | ShmemSize.getType(), |
290 | Stream.getType()}, |
291 | ), |
292 | "__cudaPopCallConfiguration"); |
293 | |
294 | CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, |
295 | {GridDim.getPointer(), BlockDim.getPointer(), |
296 | ShmemSize.getPointer(), Stream.getPointer()}); |
297 | |
298 | |
299 | llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); |
300 | CallArgList LaunchKernelArgs; |
301 | LaunchKernelArgs.add(RValue::get(Kernel), |
302 | cudaLaunchKernelFD->getParamDecl(0)->getType()); |
303 | LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); |
304 | LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); |
305 | LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), |
306 | cudaLaunchKernelFD->getParamDecl(3)->getType()); |
307 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), |
308 | cudaLaunchKernelFD->getParamDecl(4)->getType()); |
309 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), |
310 | cudaLaunchKernelFD->getParamDecl(5)->getType()); |
311 | |
312 | QualType QT = cudaLaunchKernelFD->getType(); |
313 | QualType CQT = QT.getCanonicalType(); |
314 | llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); |
315 | llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); |
316 | |
317 | const CGFunctionInfo &FI = |
318 | CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); |
319 | llvm::FunctionCallee cudaLaunchKernelFn = |
320 | CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); |
321 | CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), |
322 | LaunchKernelArgs); |
323 | CGF.EmitBranch(EndBlock); |
324 | |
325 | CGF.EmitBlock(EndBlock); |
326 | } |
327 | |
328 | void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, |
329 | FunctionArgList &Args) { |
330 | |
331 | llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); |
332 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
333 | CharUnits Offset = CharUnits::Zero(); |
334 | for (const VarDecl *A : Args) { |
335 | CharUnits TyWidth, TyAlign; |
336 | std::tie(TyWidth, TyAlign) = |
337 | CGM.getContext().getTypeInfoInChars(A->getType()); |
338 | Offset = Offset.alignTo(TyAlign); |
339 | llvm::Value *Args[] = { |
340 | CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), |
341 | VoidPtrTy), |
342 | llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), |
343 | llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), |
344 | }; |
345 | llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); |
346 | llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); |
347 | llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); |
348 | llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); |
349 | CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); |
350 | CGF.EmitBlock(NextBlock); |
351 | Offset += TyWidth; |
352 | } |
353 | |
354 | |
355 | llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); |
356 | llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); |
357 | CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); |
358 | CGF.EmitBranch(EndBlock); |
359 | |
360 | CGF.EmitBlock(EndBlock); |
361 | } |
362 | |
363 | |
364 | |
365 | |
366 | |
367 | |
368 | |
369 | |
370 | |
371 | |
372 | |
373 | |
374 | |
375 | |
376 | |
377 | llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { |
378 | |
379 | if (EmittedKernels.empty() && DeviceVars.empty()) |
380 | return nullptr; |
381 | |
382 | llvm::Function *RegisterKernelsFunc = llvm::Function::Create( |
383 | getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, |
384 | addUnderscoredPrefixToName("_register_globals"), &TheModule); |
385 | llvm::BasicBlock *EntryBB = |
386 | llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); |
387 | CGBuilderTy Builder(CGM, Context); |
388 | Builder.SetInsertPoint(EntryBB); |
389 | |
390 | |
391 | |
392 | llvm::Type *RegisterFuncParams[] = { |
393 | VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, |
394 | VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; |
395 | llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( |
396 | llvm::FunctionType::get(IntTy, RegisterFuncParams, false), |
397 | addUnderscoredPrefixToName("RegisterFunction")); |
398 | |
399 | |
400 | |
401 | |
402 | llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); |
403 | for (auto &&I : EmittedKernels) { |
404 | llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); |
405 | llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); |
406 | llvm::Value *Args[] = { |
407 | &GpuBinaryHandlePtr, |
408 | Builder.CreateBitCast(I.Kernel, VoidPtrTy), |
409 | KernelName, |
410 | KernelName, |
411 | llvm::ConstantInt::get(IntTy, -1), |
412 | NullPtr, |
413 | NullPtr, |
414 | NullPtr, |
415 | NullPtr, |
416 | llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; |
417 | Builder.CreateCall(RegisterFunc, Args); |
418 | } |
419 | |
420 | |
421 | |
422 | llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, |
423 | CharPtrTy, IntTy, IntTy, |
424 | IntTy, IntTy}; |
425 | llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( |
426 | llvm::FunctionType::get(IntTy, RegisterVarParams, false), |
427 | addUnderscoredPrefixToName("RegisterVar")); |
428 | for (auto &&Info : DeviceVars) { |
429 | llvm::GlobalVariable *Var = Info.Var; |
430 | unsigned Flags = Info.Flag; |
431 | llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); |
432 | uint64_t VarSize = |
433 | CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); |
434 | llvm::Value *Args[] = { |
435 | &GpuBinaryHandlePtr, |
436 | Builder.CreateBitCast(Var, VoidPtrTy), |
437 | VarName, |
438 | VarName, |
439 | llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), |
440 | llvm::ConstantInt::get(IntTy, VarSize), |
441 | llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), |
442 | llvm::ConstantInt::get(IntTy, 0)}; |
443 | Builder.CreateCall(RegisterVar, Args); |
444 | } |
445 | |
446 | Builder.CreateRetVoid(); |
447 | return RegisterKernelsFunc; |
448 | } |
449 | |
450 | |
451 | |
452 | |
453 | |
454 | |
455 | |
456 | |
457 | |
458 | |
459 | |
460 | |
461 | |
462 | |
463 | |
464 | |
465 | |
466 | |
467 | |
468 | |
469 | llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { |
470 | bool IsHIP = CGM.getLangOpts().HIP; |
471 | |
472 | StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; |
473 | if (CudaGpuBinaryFileName.empty() && !IsHIP) |
474 | return nullptr; |
475 | |
476 | |
477 | llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); |
478 | |
479 | |
480 | if (RelocatableDeviceCode && !RegisterGlobalsFunc) |
481 | RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); |
482 | |
483 | |
484 | llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( |
485 | llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), |
486 | addUnderscoredPrefixToName("RegisterFatBinary")); |
487 | |
488 | llvm::StructType *FatbinWrapperTy = |
489 | llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); |
490 | |
491 | |
492 | |
493 | |
494 | |
495 | std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; |
496 | if (!CudaGpuBinaryFileName.empty()) { |
497 | llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = |
498 | llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); |
499 | if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { |
500 | CGM.getDiags().Report(diag::err_cannot_open_file) |
501 | << CudaGpuBinaryFileName << EC.message(); |
502 | return nullptr; |
503 | } |
504 | CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); |
505 | } |
506 | |
507 | llvm::Function *ModuleCtorFunc = llvm::Function::Create( |
508 | llvm::FunctionType::get(VoidTy, VoidPtrTy, false), |
509 | llvm::GlobalValue::InternalLinkage, |
510 | addUnderscoredPrefixToName("_module_ctor"), &TheModule); |
511 | llvm::BasicBlock *CtorEntryBB = |
512 | llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); |
513 | CGBuilderTy CtorBuilder(CGM, Context); |
514 | |
515 | CtorBuilder.SetInsertPoint(CtorEntryBB); |
516 | |
517 | const char *FatbinConstantName; |
518 | const char *FatbinSectionName; |
519 | const char *ModuleIDSectionName; |
520 | StringRef ModuleIDPrefix; |
521 | llvm::Constant *FatBinStr; |
522 | unsigned FatMagic; |
523 | if (IsHIP) { |
524 | FatbinConstantName = ".hip_fatbin"; |
525 | FatbinSectionName = ".hipFatBinSegment"; |
526 | |
527 | ModuleIDSectionName = "__hip_module_id"; |
528 | ModuleIDPrefix = "__hip_"; |
529 | |
530 | if (CudaGpuBinary) { |
531 | |
532 | |
533 | FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", |
534 | FatbinConstantName, 8); |
535 | } else { |
536 | |
537 | |
538 | |
539 | |
540 | FatBinStr = new llvm::GlobalVariable( |
541 | CGM.getModule(), CGM.Int8Ty, |
542 | , llvm::GlobalValue::ExternalLinkage, nullptr, |
543 | "__hip_fatbin", nullptr, |
544 | llvm::GlobalVariable::NotThreadLocal); |
545 | cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); |
546 | } |
547 | |
548 | FatMagic = HIPFatMagic; |
549 | } else { |
550 | if (RelocatableDeviceCode) |
551 | FatbinConstantName = CGM.getTriple().isMacOSX() |
552 | ? "__NV_CUDA,__nv_relfatbin" |
553 | : "__nv_relfatbin"; |
554 | else |
555 | FatbinConstantName = |
556 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; |
557 | |
558 | FatbinSectionName = |
559 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; |
560 | |
561 | ModuleIDSectionName = CGM.getTriple().isMacOSX() |
562 | ? "__NV_CUDA,__nv_module_id" |
563 | : "__nv_module_id"; |
564 | ModuleIDPrefix = "__nv_"; |
565 | |
566 | |
567 | |
568 | FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", |
569 | FatbinConstantName, 8); |
570 | FatMagic = CudaFatMagic; |
571 | } |
572 | |
573 | |
574 | ConstantInitBuilder Builder(CGM); |
575 | auto Values = Builder.beginStruct(FatbinWrapperTy); |
576 | |
577 | Values.addInt(IntTy, FatMagic); |
578 | |
579 | Values.addInt(IntTy, 1); |
580 | |
581 | Values.add(FatBinStr); |
582 | |
583 | Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); |
584 | llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( |
585 | addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), |
586 | true); |
587 | FatbinWrapper->setSection(FatbinSectionName); |
588 | |
589 | |
590 | |
591 | |
592 | |
593 | |
594 | |
595 | |
596 | if (IsHIP) { |
597 | auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : |
598 | llvm::GlobalValue::LinkOnceAnyLinkage; |
599 | llvm::BasicBlock *IfBlock = |
600 | llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); |
601 | llvm::BasicBlock *ExitBlock = |
602 | llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); |
603 | |
604 | |
605 | GpuBinaryHandle = new llvm::GlobalVariable( |
606 | TheModule, VoidPtrPtrTy, , |
607 | Linkage, |
608 | llvm::ConstantPointerNull::get(VoidPtrPtrTy), |
609 | "__hip_gpubin_handle"); |
610 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); |
611 | |
612 | if (Linkage != llvm::GlobalValue::InternalLinkage) |
613 | GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); |
614 | Address GpuBinaryAddr( |
615 | GpuBinaryHandle, |
616 | CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); |
617 | { |
618 | auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
619 | llvm::Constant *Zero = |
620 | llvm::Constant::getNullValue(HandleValue->getType()); |
621 | llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); |
622 | CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); |
623 | } |
624 | { |
625 | CtorBuilder.SetInsertPoint(IfBlock); |
626 | |
627 | llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( |
628 | RegisterFatbinFunc, |
629 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); |
630 | CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); |
631 | CtorBuilder.CreateBr(ExitBlock); |
632 | } |
633 | { |
634 | CtorBuilder.SetInsertPoint(ExitBlock); |
635 | |
636 | if (RegisterGlobalsFunc) { |
637 | auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
638 | CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); |
639 | } |
640 | } |
641 | } else if (!RelocatableDeviceCode) { |
642 | |
643 | |
644 | |
645 | llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( |
646 | RegisterFatbinFunc, |
647 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); |
648 | GpuBinaryHandle = new llvm::GlobalVariable( |
649 | TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, |
650 | llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); |
651 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); |
652 | CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, |
653 | CGM.getPointerAlign()); |
654 | |
655 | |
656 | if (RegisterGlobalsFunc) |
657 | CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); |
658 | |
659 | |
660 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
661 | CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { |
662 | |
663 | llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( |
664 | llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), |
665 | "__cudaRegisterFatBinaryEnd"); |
666 | CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); |
667 | } |
668 | } else { |
669 | |
670 | SmallString<64> ModuleID; |
671 | llvm::raw_svector_ostream OS(ModuleID); |
672 | OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); |
673 | llvm::Constant *ModuleIDConstant = |
674 | makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); |
675 | |
676 | |
677 | llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, |
678 | Twine("__fatbinwrap") + ModuleID, FatbinWrapper); |
679 | |
680 | |
681 | |
682 | SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); |
683 | RegisterLinkedBinaryName += ModuleID; |
684 | llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( |
685 | getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); |
686 | |
687 | (0) . __assert_fail ("RegisterGlobalsFunc && \"Expecting at least dummy function!\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGCUDANV.cpp", 687, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); |
688 | llvm::Value *Args[] = {RegisterGlobalsFunc, |
689 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), |
690 | ModuleIDConstant, |
691 | makeDummyFunction(getCallbackFnTy())}; |
692 | CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); |
693 | } |
694 | |
695 | |
696 | |
697 | |
698 | if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { |
699 | |
700 | llvm::FunctionType *AtExitTy = |
701 | llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); |
702 | llvm::FunctionCallee AtExitFunc = |
703 | CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), |
704 | ); |
705 | CtorBuilder.CreateCall(AtExitFunc, CleanupFn); |
706 | } |
707 | |
708 | CtorBuilder.CreateRetVoid(); |
709 | return ModuleCtorFunc; |
710 | } |
711 | |
712 | |
713 | |
714 | |
715 | |
716 | |
717 | |
718 | |
719 | |
720 | |
721 | |
722 | |
723 | |
724 | |
725 | |
726 | |
727 | |
728 | |
729 | |
730 | |
731 | llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { |
732 | |
733 | if (!GpuBinaryHandle) |
734 | return nullptr; |
735 | |
736 | |
737 | llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( |
738 | llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), |
739 | addUnderscoredPrefixToName("UnregisterFatBinary")); |
740 | |
741 | llvm::Function *ModuleDtorFunc = llvm::Function::Create( |
742 | llvm::FunctionType::get(VoidTy, VoidPtrTy, false), |
743 | llvm::GlobalValue::InternalLinkage, |
744 | addUnderscoredPrefixToName("_module_dtor"), &TheModule); |
745 | |
746 | llvm::BasicBlock *DtorEntryBB = |
747 | llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); |
748 | CGBuilderTy DtorBuilder(CGM, Context); |
749 | DtorBuilder.SetInsertPoint(DtorEntryBB); |
750 | |
751 | Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( |
752 | GpuBinaryHandle->getAlignment())); |
753 | auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); |
754 | |
755 | |
756 | |
757 | if (CGM.getLangOpts().HIP) { |
758 | llvm::BasicBlock *IfBlock = |
759 | llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); |
760 | llvm::BasicBlock *ExitBlock = |
761 | llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); |
762 | llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); |
763 | llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); |
764 | DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); |
765 | |
766 | DtorBuilder.SetInsertPoint(IfBlock); |
767 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
768 | DtorBuilder.CreateStore(Zero, GpuBinaryAddr); |
769 | DtorBuilder.CreateBr(ExitBlock); |
770 | |
771 | DtorBuilder.SetInsertPoint(ExitBlock); |
772 | } else { |
773 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
774 | } |
775 | DtorBuilder.CreateRetVoid(); |
776 | return ModuleDtorFunc; |
777 | } |
778 | |
779 | CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { |
780 | return new CGNVCUDARuntime(CGM); |
781 | } |
782 | |