Clang Project

clang_source_code/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
  /// global_tid, void *global_buffer, int32_t num_of_records, void*
  /// reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
  /// *buffer, int idx, void *reduce_data));
  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
  /// const void *buf, size_t size, int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t
  /// isSPMDExecutionMode, int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
  OMPRTL__kmpc_barrier,
  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL__kmpc_barrier_simple_spmd,
};
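
// Illustration (editor's addition, not part of the original file): a rough
// pseudo-C sketch of how a generic-mode kernel is expected to use the entry
// points above. The exact control flow is hypothetical and simplified; the
// real protocol is emitted by emitNonSPMDKernel/emitWorkerFunction below.
//
//   if (threadIdx.x < thread_limit) {             // worker warps
//     for (;;) {
//       void *work_fn;
//       if (!__kmpc_kernel_parallel(&work_fn, /*IsOMPRuntimeInitialized=*/1))
//         break;                                  // kernel is shutting down
//       if (work_fn)
//         ((void (*)())work_fn)();                // run outlined parallel body
//       __kmpc_kernel_end_parallel();
//     }
//   } else if (threadIdx.x == master_tid) {       // master (last warp)
//     __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//     // ...sequential target region; publishes parallel work via
//     //    __kmpc_kernel_prepare_parallel(...)...
//     __kmpc_kernel_deinit(/*IsOMPRuntimeInitialized=*/1);
//   }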

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeNVPTX::EM_Unknown;
  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
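
// Worked example (editor's addition): with WarpSize == 32, LaneIDBits ==
// log_2(32) == 5 and LaneIDMask == 31 (0b11111), so a thread id decomposes as
//   warp_id = tid >> LaneIDBits;   // e.g. 37 >> 5 == 1
//   lane_id = tid &  LaneIDMask;   // e.g. 37 & 31 == 5
// which is exactly what getNVPTXWarpID/getNVPTXLaneID emit further below.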

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(
                   static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
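
// Illustration (editor's addition, hypothetical target region): two common
// ways a local escapes its declaration context and gets globalized by the
// visitor above:
//
//   #pragma omp target
//   {
//     int a = 0;
//     #pragma omp parallel shared(a)  // captured by reference: escapes via
//     { use(a); }                     // VisitOpenMPCapturedStmt
//     consume(&a);                    // UO_AddrOf sets AllEscaped: escapes
//   }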

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block.  Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
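
// Worked example (editor's addition): the emitted expression is
//   master_tid = (NumThreads - 1) & ~(WarpSize - 1)
// i.e. (NumThreads - 1) rounded down to a multiple of WarpSize. With
// WarpSize == 32, so ~Mask clears the low five bits:
//   NumThreads == 33:     32 & ~31 ==  32
//   NumThreads == 64:     63 & ~31 ==  32
//   NumThreads == 1024: 1023 & ~31 == 992
// matching the examples in the doc comment above.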

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}

/// Checks if the expression is constant or does not have non-trivial function
/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr *E) {
  // We can skip constant expressions.
  // We can skip expressions with trivial calls or simple expressions.
  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
          !E->hasNonTrivialCall(Ctx)) &&
         !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
}

/// Checks if \p Body is a CompoundStmt and returns its child statement
/// iff there is exactly one child that is not evaluatable at compile time.
static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
    const Stmt *Child = nullptr;
    for (const Stmt *S : C->body()) {
      if (const auto *E = dyn_cast<Expr>(S)) {
        if (isTrivial(Ctx, E))
          continue;
      }
      // Some of the statements can be ignored.
      if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
          isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
        continue;
      // Analyze declarations.
      if (const auto *DS = dyn_cast<DeclStmt>(S)) {
        if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
              if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
                  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
                  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
                  isa<UsingDirectiveDecl>(D) ||
                  isa<OMPDeclareReductionDecl>(D) ||
                  isa<OMPThreadPrivateDecl>(D) || isa<OMPAllocateDecl>(D))
                return true;
              const auto *VD = dyn_cast<VarDecl>(D);
              if (!VD)
                return false;
              return VD->isConstexpr() ||
                     ((VD->getType().isTrivialType(Ctx) ||
                       VD->getType()->isReferenceType()) &&
                      (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
            }))
          continue;
      }
      // Found multiple children - cannot get the one child only.
      if (Child)
        return Body;
      Child = S;
    }
    if (Child)
      return Child;
  }
  return Body;
}
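
// Illustration (editor's addition, hypothetical body): for a captured region
// such as
//   {
//     constexpr int N = 128;     // constexpr decl: ignored
//     ;                          // NullStmt: ignored
//     #pragma omp parallel for   // the only "real" child
//     for (int i = 0; i < N; ++i) work(i);
//   }
// getSingleCompoundChild returns the 'parallel for' directive, which lets the
// SPMD checks below look through trivial wrapper statements.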

/// Check if the parallel directive has an 'if' clause with a non-constant or
/// false condition, or strictly specifies the number of threads. Such
/// directives must be run in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
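
// Illustration (editor's addition, hypothetical directives): these both
// return true and therefore inhibit SPMD mode:
//   #pragma omp parallel num_threads(4)      // thread count is fixed
//   #pragma omp parallel if(parallel: cond)  // cond is not a compile-time
//                                            // constant (or folds to false)
// whereas '#pragma omp parallel if(parallel: 1)' alone would return false.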

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
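
// Illustration (editor's addition, hypothetical code): a target region that
// qualifies for SPMD mode, because the tightly nested construct is a parallel
// directive with neither a num_threads clause nor a non-constant 'if':
//   #pragma omp target teams
//   #pragma omp parallel for
//   for (int i = 0; i < n; ++i) work(i);
// Adding num_threads(...) to the inner parallel would force non-SPMD mode.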

/// Check if the directive is loop-based and either has no schedule clause at
/// all or has static scheduling; an 'ordered' clause disqualifies it.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
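
// Illustration (editor's addition): under this predicate,
//   #pragma omp for                          // no schedule clause: qualifies
//   #pragma omp for schedule(static)         // static: qualifies
//   #pragma omp for schedule(dynamic)        // non-static: does not qualify
//   #pragma omp for ordered schedule(static) // 'ordered' disqualifies it
// Only the first two permit the lightweight-runtime checks below.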

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Ctx, Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
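
// Illustration (editor's addition, hypothetical directive): a combined
// construct that satisfies both predicates and can therefore run in SPMD mode
// with the lightweight runtime:
//   #pragma omp target teams distribute parallel for schedule(static)
//   for (int i = 0; i < n; ++i) a[i] += b[i];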

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
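
// Illustration (editor's addition): the entry header and footer above emit,
// roughly, the following kernel skeleton (pseudo-C for the generated IR):
//
//   if (tid < thread_limit) {          // .worker
//     worker();                        // spin in the worker loop
//     goto exit;
//   }
//   if (tid != master_tid) goto exit;  // .mastercheck
//   // .master
//   __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//   __kmpc_data_sharing_init_stack();
//   // ...target region body...
//   __kmpc_kernel_deinit(/*IsOMPRuntimeInitialized=*/1);
//   syncCTAThreads();                  // release workers to terminate
// exit:                                // .exit
//   return;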
1308
1309void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1310                                          StringRef ParentName,
1311                                          llvm::Function *&OutlinedFn,
1312                                          llvm::Constant *&OutlinedFnID,
1313                                          bool IsOffloadEntry,
1314                                          const RegionCodeGenTy &CodeGen) {
1315  ExecutionRuntimeModesRAII ModeRAII(
1316      CurrentExecutionModeRequiresFullRuntime,
1317      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1318          !supportsLightweightRuntime(CGM.getContext()D));
1319  EntryFunctionState EST;
1320
1321  // Emit target region as a standalone region.
1322  class NVPTXPrePostActionTy : public PrePostActionTy {
1323    CGOpenMPRuntimeNVPTX &RT;
1324    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1325    const OMPExecutableDirective &D;
1326
1327  public:
1328    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1329                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1330                         const OMPExecutableDirective &D)
1331        : RT(RT), EST(EST), D(D) {}
1332    void Enter(CodeGenFunction &CGF) override {
1333      RT.emitSPMDEntryHeader(CGF, EST, D);
1334      // Skip target region initialization.
1335      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1336    }
1337    void Exit(CodeGenFunction &CGF) override {
1338      RT.clearLocThreadIdInsertPt(CGF);
1339      RT.emitSPMDEntryFooter(CGF, EST);
1340    }
1341  } Action(*this, EST, D);
1342  CodeGen.setAction(Action);
1343  IsInTTDRegion = true;
1344  // Reserve place for the globalized memory.
1345  GlobalizedRecords.emplace_back();
1346  if (!KernelStaticGlobalized) {
1347    KernelStaticGlobalized = new llvm::GlobalVariable(
1348        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1349        llvm::GlobalValue::InternalLinkage,
1350        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1351        "_openmp_kernel_static_glob_rd$ptr"/*InsertBefore=*/nullptr,
1352        llvm::GlobalValue::NotThreadLocal,
1353        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1354  }
1355  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1356                                   IsOffloadEntry, CodeGen);
1357  IsInTTDRegion = false;
1358}
1359
1360void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1361    CodeGenFunction &CGF, EntryFunctionState &EST,
1362    const OMPExecutableDirective &D) {
1363  CGBuilderTy &Bld = CGF.Builder;
1364
1365  // Setup BBs in entry function.
1366  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1367  EST.ExitBB = CGF.createBasicBlock(".exit");
1368
1369  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1370                         /*RequiresOMPRuntime=*/
1371                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1372                         /*RequiresDataSharing=*/Bld.getInt16(0)};
1373  CGF.EmitRuntimeCall(
1374      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1375
1376  if (RequiresFullRuntime) {
1377    // For data sharing, we need to initialize the stack.
1378    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1379        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1380  }
1381
1382  CGF.EmitBranch(ExecuteBB);
1383
1384  CGF.EmitBlock(ExecuteBB);
1385
1386  IsInTargetMasterThreadRegion = true;
1387}
1388
1389void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1390                                               EntryFunctionState &EST) {
1391  IsInTargetMasterThreadRegion = false;
1392  if (!CGF.HaveInsertPoint())
1393    return;
1394
1395  if (!EST.ExitBB)
1396    EST.ExitBB = CGF.createBasicBlock(".exit");
1397
1398  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1399  CGF.EmitBranch(OMPDeInitBB);
1400
1401  CGF.EmitBlock(OMPDeInitBB);
1402  // DeInitialize the OMP state in the runtime; called by all active threads.
1403  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1404                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1405  CGF.EmitRuntimeCall(
1406      createNVPTXRuntimeFunction(
1407          OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1408  CGF.EmitBranch(EST.ExitBB);
1409
1410  CGF.EmitBlock(EST.ExitBB);
1411  EST.ExitBB = nullptr;
1412}
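// Editor's note: taken together, the SPMD header and footer above bracket the
// target region body in roughly this skeleton (illustrative, not verbatim IR):
//
//   __kmpc_spmd_kernel_init(thread_limit, RequiresOMPRuntime, /*RequiresDataSharing=*/0);
//   if (RequiresFullRuntime) __kmpc_data_sharing_init_stack_spmd();
//   .execute:    ... target region body, executed by all threads ...
//   .omp.deinit: __kmpc_spmd_kernel_deinit_v2(RequiresOMPRuntime);
//   .exit: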
1413
1414// Create a unique global variable to indicate the execution mode of this target
1415// region. The execution mode is either 'generic' or 'spmd', depending on the
1416// target directive. This variable is picked up by the offload library to set
1417// up the device appropriately before kernel launch. If the execution mode is
1418// 'generic', the runtime reserves one warp for the master, otherwise, all
1419// warps participate in parallel work.
1420static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1421                                     bool Mode) {
1422  auto *GVMode =
1423      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1424                               llvm::GlobalValue::WeakAnyLinkage,
1425                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1426                               Twine(Name, "_exec_mode"));
1427  CGM.addCompilerUsedGlobal(GVMode);
1428}
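// Editor's note: for a kernel named, e.g., "__omp_offloading_42_foo_l10" (name
// hypothetical), setPropertyExecutionMode emits roughly the following IR: an i8
// global that is 0 for SPMD mode and 1 for generic mode:
//
//   @__omp_offloading_42_foo_l10_exec_mode = weak constant i8 0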
1429
1430void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1431  ASTContext &Ctx = CGM.getContext();
1432
1433  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1434  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1435                    WST.Loc, WST.Loc);
1436  emitWorkerLoop(CGF, WST);
1437  CGF.FinishFunction();
1438}
1439
1440void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1441                                          WorkerFunctionState &WST) {
1442  //
1443  // The workers enter this loop and wait for parallel work from the master.
1444  // When the master encounters a parallel region it sets up the work + variable
1445  // arguments, and wakes up the workers.  The workers first check to see if
1446  // they are required for the parallel region, i.e., within the # of requested
1447  // parallel threads.  The activated workers load the variable arguments and
1448  // execute the parallel work.
1449  //
1450
1451  CGBuilderTy &Bld = CGF.Builder;
1452
1453  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1454  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1455  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1456  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1457  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1458  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1459
1460  CGF.EmitBranch(AwaitBB);
1461
1462  // Workers wait for work from master.
1463  CGF.EmitBlock(AwaitBB);
1464  // Wait for parallel work
1465  syncCTAThreads(CGF);
1466
1467  Address WorkFn =
1468      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1469  Address ExecStatus =
1470      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1471  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1472  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1473
1474  // TODO: Optimize runtime initialization and pass in correct value.
1475  llvm::Value *Args[] = {WorkFn.getPointer(),
1476                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1477  llvm::Value *Ret = CGF.EmitRuntimeCall(
1478      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1479  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1480
1481  // On termination condition (workid == 0), exit loop.
1482  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1483  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1484  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1485
1486  // Activate requested workers.
1487  CGF.EmitBlock(SelectWorkersBB);
1488  llvm::Value *IsActive =
1489      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1490  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1491
1492  // Signal start of parallel region.
1493  CGF.EmitBlock(ExecuteBB);
1494  // Skip initialization.
1495  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1496
1497  // Process work items: outlined parallel functions.
1498  for (llvm::Function *W : Work) {
1499    // Try to match this outlined function.
1500    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1501
1502    llvm::Value *WorkFnMatch =
1503        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1504
1505    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1506    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1507    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1508
1509    // Execute this outlined function.
1510    CGF.EmitBlock(ExecuteFNBB);
1511
1512    // Insert call to work function via shared wrapper. The shared
1513    // wrapper takes two arguments:
1514    //   - the parallelism level;
1515    //   - the thread ID;
1516    emitCall(CGF, WST.Loc, W,
1517             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1518
1519    // Go to end of parallel region.
1520    CGF.EmitBranch(TerminateBB);
1521
1522    CGF.EmitBlock(CheckNextBB);
1523  }
1524  // Default case: call to outlined function through pointer if the target
1525  // region makes a declare target call that may contain an orphaned parallel
1526  // directive.
1527  auto *ParallelFnTy =
1528      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1529                              /*isVarArg=*/false);
1530  llvm::Value *WorkFnCast =
1531      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1532  // Insert call to work function via shared wrapper. The shared
1533  // wrapper takes two arguments:
1534  //   - the parallelism level;
1535  //   - the thread ID;
1536  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1537           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1538  // Go to end of parallel region.
1539  CGF.EmitBranch(TerminateBB);
1540
1541  // Signal end of parallel region.
1542  CGF.EmitBlock(TerminateBB);
1543  CGF.EmitRuntimeCall(
1544      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1545      llvm::None);
1546  CGF.EmitBranch(BarrierBB);
1547
1548  // All active and inactive workers wait at a barrier after parallel region.
1549  CGF.EmitBlock(BarrierBB);
1550  // Barrier after parallel region.
1551  syncCTAThreads(CGF);
1552  CGF.EmitBranch(AwaitBB);
1553
1554  // Exit target region.
1555  CGF.EmitBlock(ExitBB);
1556  // Skip initialization.
1557  clearLocThreadIdInsertPt(CGF);
1558}
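// Editor's note: a condensed sketch (not in the original source) of the worker
// state machine emitted above:
//
//   .await.work:       barrier; active = __kmpc_kernel_parallel(&work_fn, ...);
//                      if (work_fn == null) goto .exit;
//   .select.workers:   if (!active) goto .barrier.parallel;
//   .execute.parallel: run the matching outlined function (or an indirect call
//                      for an orphaned 'parallel'); __kmpc_kernel_end_parallel();
//   .barrier.parallel: barrier; goto .await.work;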
1559
1560/// Returns specified OpenMP runtime function for the current OpenMP
1561/// implementation.  Specialized for the NVPTX device.
1562/// \param Function OpenMP runtime function.
1563/// \return Specified function.
1564llvm::FunctionCallee
1565CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1566  llvm::FunctionCallee RTLFn = nullptr;
1567  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1568  case OMPRTL_NVPTX__kmpc_kernel_init: {
1569    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1570    // RequiresOMPRuntime);
1571    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1572    auto *FnTy =
1573        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1574    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1575    break;
1576  }
1577  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1578    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1579    llvm::Type *TypeParams[] = {CGM.Int16Ty};
1580    auto *FnTy =
1581        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1583    break;
1584  }
1585  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1586    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1587    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1588    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1589    auto *FnTy =
1590        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1591    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1592    break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1595    // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1596    llvm::Type *TypeParams[] = {CGM.Int16Ty};
1597    auto *FnTy =
1598        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1599    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1600    break;
1601  }
1602  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1603    /// Build void __kmpc_kernel_prepare_parallel(
1604    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1605    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1606    auto *FnTy =
1607        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1608    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1609    break;
1610  }
1611  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1612    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1613    /// int16_t IsOMPRuntimeInitialized);
1614    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1615    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1616    auto *FnTy =
1617        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1618    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1619    break;
1620  }
1621  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1622    /// Build void __kmpc_kernel_end_parallel();
1623    auto *FnTy =
1624        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1625    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1626    break;
1627  }
1628  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1629    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1630    // global_tid);
1631    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1632    auto *FnTy =
1633        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1634    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1635    break;
1636  }
1637  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1638    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1639    // global_tid);
1640    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1641    auto *FnTy =
1642        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1643    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1644    break;
1645  }
1646  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1647    // Build int32_t __kmpc_shuffle_int32(int32_t element,
1648    // int16_t lane_offset, int16_t warp_size);
1649    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1650    auto *FnTy =
1651        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1652    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1653    break;
1654  }
1655  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1656    // Build int64_t __kmpc_shuffle_int64(int64_t element,
1657    // int16_t lane_offset, int16_t warp_size);
1658    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1659    auto *FnTy =
1660        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1661    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1662    break;
1663  }
1664  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1665    // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1666    // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1667    // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1668    // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1669    // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1670    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1671                                             CGM.Int16Ty, CGM.Int16Ty};
1672    auto *ShuffleReduceFnTy =
1673        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1674                                /*isVarArg=*/false);
1675    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1676    auto *InterWarpCopyFnTy =
1677        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1678                                /*isVarArg=*/false);
1679    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1680                                CGM.Int32Ty,
1681                                CGM.Int32Ty,
1682                                CGM.SizeTy,
1683                                CGM.VoidPtrTy,
1684                                ShuffleReduceFnTy->getPointerTo(),
1685                                InterWarpCopyFnTy->getPointerTo()};
1686    auto *FnTy =
1687        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1688    RTLFn = CGM.CreateRuntimeFunction(
1689        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1690    break;
1691  }
1692  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1693    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
1694    llvm::Type *TypeParams[] = {CGM.Int32Ty};
1695    auto *FnTy =
1696        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1697    RTLFn = CGM.CreateRuntimeFunction(
1698        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1699    break;
1700  }
1701  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1702    // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1703    // global_tid, void *global_buffer, int32_t num_of_records, void*
1704    // reduce_data,
1705    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1706    // lane_offset, int16_t shortCircuit),
1707    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1708    // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1709    // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1710    // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1711    // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1712    // *buffer, int idx, void *reduce_data));
1713    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1714                                             CGM.Int16Ty, CGM.Int16Ty};
1715    auto *ShuffleReduceFnTy =
1716        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1717                                /*isVarArg=*/false);
1718    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1719    auto *InterWarpCopyFnTy =
1720        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1721                                /*isVarArg=*/false);
1722    llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1723                                          CGM.VoidPtrTy};
1724    auto *GlobalListFnTy =
1725        llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1726                                /*isVarArg=*/false);
1727    llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1728                                CGM.Int32Ty,
1729                                CGM.VoidPtrTy,
1730                                CGM.Int32Ty,
1731                                CGM.VoidPtrTy,
1732                                ShuffleReduceFnTy->getPointerTo(),
1733                                InterWarpCopyFnTy->getPointerTo(),
1734                                GlobalListFnTy->getPointerTo(),
1735                                GlobalListFnTy->getPointerTo(),
1736                                GlobalListFnTy->getPointerTo(),
1737                                GlobalListFnTy->getPointerTo()};
1738    auto *FnTy =
1739        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1740    RTLFn = CGM.CreateRuntimeFunction(
1741        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1742    break;
1743  }
1744  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1745    /// Build void __kmpc_data_sharing_init_stack();
1746    auto *FnTy =
1747        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1748    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1749    break;
1750  }
1751  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1752    /// Build void __kmpc_data_sharing_init_stack_spmd();
1753    auto *FnTy =
1754        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1755    RTLFn =
1756        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1757    break;
1758  }
1759  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1760    // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1761    // int16_t UseSharedMemory);
1762    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1763    auto *FnTy =
1764        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1765    RTLFn = CGM.CreateRuntimeFunction(
1766        FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1767    break;
1768  }
1769  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1770    // Build void __kmpc_data_sharing_pop_stack(void *a);
1771    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1772    auto *FnTy =
1773        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1774    RTLFn = CGM.CreateRuntimeFunction(FnTy,
1775                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
1776    break;
1777  }
1778  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1779    /// Build void __kmpc_begin_sharing_variables(void ***args,
1780    /// size_t n_args);
1781    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1782    auto *FnTy =
1783        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1784    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1785    break;
1786  }
1787  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1788    /// Build void __kmpc_end_sharing_variables();
1789    auto *FnTy =
1790        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1791    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1792    break;
1793  }
1794  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1795    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1796    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1797    auto *FnTy =
1798        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1799    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1800    break;
1801  }
1802  case OMPRTL_NVPTX__kmpc_parallel_level: {
1803    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1804    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1805    auto *FnTy =
1806        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1807    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1808    break;
1809  }
1810  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1811    // Build int8_t __kmpc_is_spmd_exec_mode();
1812    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1813    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1814    break;
1815  }
1816  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1817    // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1818    // const void *buf, size_t size, int16_t is_shared, const void **res);
1819    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1820                                CGM.Int16Ty, CGM.VoidPtrPtrTy};
1821    auto *FnTy =
1822        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1823    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1824    break;
1825  }
1826  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1827    // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1828    // int16_t is_shared);
1829    llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1830    auto *FnTy =
1831        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1832    RTLFn =
1833        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1834    break;
1835  }
1836  case OMPRTL__kmpc_barrier: {
1837    // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1838    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1839    auto *FnTy =
1840        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1841    RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1842    cast<llvm::Function>(RTLFn.getCallee())
1843        ->addFnAttr(llvm::Attribute::Convergent);
1844    break;
1845  }
1846  case OMPRTL__kmpc_barrier_simple_spmd: {
1847    // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1848    // global_tid);
1849    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1850    auto *FnTy =
1851        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1852    RTLFn =
1853        CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1854    cast<llvm::Function>(RTLFn.getCallee())
1855        ->addFnAttr(llvm::Attribute::Convergent);
1856    break;
1857  }
1858  }
1859  return RTLFn;
1860}
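// Editor's note: callers pair the callee returned above with EmitRuntimeCall;
// a typical use, following the pattern seen throughout this file (argument
// values illustrative):
//
//   llvm::Value *Args[] = {RTLoc, ThreadID};
//   CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);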
1861
1862void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1863                                              llvm::Constant *Addr,
1864                                              uint64_t Size, int32_t,
1865                                              llvm::GlobalValue::LinkageTypes) {
1866  // TODO: Add support for global variables on the device after declare target
1867  // support.
1868  if (!isa<llvm::Function>(Addr))
1869    return;
1870  llvm::Module &M = CGM.getModule();
1871  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1872
1873  // Get "nvvm.annotations" metadata node
1874  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1875
1876  llvm::Metadata *MDVals[] = {
1877      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1878      llvm::ConstantAsMetadata::get(
1879          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1880  // Append metadata to nvvm.annotations
1881  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1882}
1883
1884void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1885    const OMPExecutableDirective &D, StringRef ParentName,
1886    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1887    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1888  if (!IsOffloadEntry) // Nothing to do.
1889    return;
1890
1891  assert(!ParentName.empty() && "Invalid target region parent name!");
1892
1893  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1894  if (Mode)
1895    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1896                   CodeGen);
1897  else
1898    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1899                      CodeGen);
1900
1901  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1902}
1903
1904namespace {
1905LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1906/// Enum for accessing the reserved_2 field of the ident_t struct.
1907enum ModeFlagsTy : unsigned {
1908  /// Bit set to 1 when in SPMD mode.
1909  KMP_IDENT_SPMD_MODE = 0x01,
1910  /// Bit set to 1 when a simplified runtime is used.
1911  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1912  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1913};
1914
1915/// Special mode Undefined: the combination of Non-SPMD mode + SimpleRuntime.
1916static const ModeFlagsTy UndefinedMode =
1917    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1918} // anonymous namespace
1919
1920unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1921  switch (getExecutionMode()) {
1922  case EM_SPMD:
1923    if (requiresFullRuntime())
1924      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1925    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1926  case EM_NonSPMD:
1927    assert(requiresFullRuntime() && "Expected full runtime.");
1928    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1929  case EM_Unknown:
1930    return UndefinedMode;
1931  }
1932  llvm_unreachable("Unknown flags are requested.");
1933}
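// Editor's note: the reserved_2 encodings produced by the switch above,
// spelled out:
//
//   SPMD + full runtime     -> KMP_IDENT_SPMD_MODE                             (0x01)
//   SPMD + simple runtime   -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE  (0x03)
//   Non-SPMD (full runtime) -> neither bit set                                 (0x00)
//   Unknown                 -> UndefinedMode                                   (0x02)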
1934
1935CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1936    : CGOpenMPRuntime(CGM, "_""$") {
1937  if (!CGM.getLangOpts().OpenMPIsDevice)
1938    llvm_unreachable("OpenMP NVPTX can only handle device code.");
1939}
1940
1941void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1942                                              OpenMPProcBindClauseKind ProcBind,
1943                                              SourceLocation Loc) {
1944  // Do nothing in case of SPMD mode and L0 parallel.
1945  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1946    return;
1947
1948  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1949}
1950
1951void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1952                                                llvm::Value *NumThreads,
1953                                                SourceLocation Loc) {
1954  // Do nothing in case of SPMD mode and L0 parallel.
1955  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1956    return;
1957
1958  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1959}
1960
1961void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1962                                              const Expr *NumTeams,
1963                                              const Expr *ThreadLimit,
1964                                              SourceLocation Loc) {}
1965
1966llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1967    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1968    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1969  // Emit target region as a standalone region.
1970  class NVPTXPrePostActionTy : public PrePostActionTy {
1971    bool &IsInParallelRegion;
1972    bool PrevIsInParallelRegion;
1973
1974  public:
1975    NVPTXPrePostActionTy(bool &IsInParallelRegion)
1976        : IsInParallelRegion(IsInParallelRegion) {}
1977    void Enter(CodeGenFunction &CGF) override {
1978      PrevIsInParallelRegion = IsInParallelRegion;
1979      IsInParallelRegion = true;
1980    }
1981    void Exit(CodeGenFunction &CGF) override {
1982      IsInParallelRegion = PrevIsInParallelRegion;
1983    }
1984  } Action(IsInParallelRegion);
1985  CodeGen.setAction(Action);
1986  bool PrevIsInTTDRegion = IsInTTDRegion;
1987  IsInTTDRegion = false;
1988  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1989  IsInTargetMasterThreadRegion = false;
1990  auto *OutlinedFun =
1991      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1992          D, ThreadIDVar, InnermostKind, CodeGen));
1993  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1994  IsInTTDRegion = PrevIsInTTDRegion;
1995  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1996      !IsInParallelRegion) {
1997    llvm::Function *WrapperFun =
1998        createParallelDataSharingWrapper(OutlinedFun, D);
1999    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
2000  }
2001
2002  return OutlinedFun;
2003}
2004
2005/// Get list of lastprivate variables from the teams distribute ... or
2006/// teams {distribute ...} directives.
2007static void
2008getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2009                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2010  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2011         "expected teams directive.");
2012  const OMPExecutableDirective *Dir = &D;
2013  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
2014    if (const Stmt *S = getSingleCompoundChild(
2015            Ctx,
2016            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
2017                /*IgnoreCaptured=*/true))) {
2018      Dir = dyn_cast<OMPExecutableDirective>(S);
2019      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2020        Dir = nullptr;
2021    }
2022  }
2023  if (!Dir)
2024    return;
2025  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2026    for (const Expr *E : C->getVarRefs())
2027      Vars.push_back(getPrivateItem(E));
2028  }
2029}
2030
2031/// Get list of reduction variables from the teams ... directives.
2032static void
2033getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2034                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2035  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2036         "expected teams directive.");
2037  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2038    for (const Expr *E : C->privates())
2039      Vars.push_back(getPrivateItem(E));
2040  }
2041}
2042
2043llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2044    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2045    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2046  SourceLocation Loc = D.getBeginLoc();
2047
2048  const RecordDecl *GlobalizedRD = nullptr;
2049  llvm::SmallVector<const ValueDecl *, 4LastPrivatesReductions;
2050  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2051  // Globalize team reductions variable unconditionally in all modes.
2052  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2053    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2054  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2055    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2056    if (!LastPrivatesReductions.empty()) {
2057      GlobalizedRD = ::buildRecordForGlobalizedVars(
2058          CGM.getContext(), llvm::None, LastPrivatesReductions,
2059          MappedDeclsFields, WarpSize);
2060    }
2061  } else if (!LastPrivatesReductions.empty()) {
2062    assert(!TeamAndReductions.first &&
2063           "Previous team declaration is not expected.");
2064    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2065    std::swap(TeamAndReductions.second, LastPrivatesReductions);
2066  }
2067
2068  // Emit target region as a standalone region.
2069  class NVPTXPrePostActionTy : public PrePostActionTy {
2070    SourceLocation &Loc;
2071    const RecordDecl *GlobalizedRD;
2072    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2073        &MappedDeclsFields;
2074
2075  public:
2076    NVPTXPrePostActionTy(
2077        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2078        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2079            &MappedDeclsFields)
2080        : Loc(Loc), GlobalizedRD(GlobalizedRD),
2081          MappedDeclsFields(MappedDeclsFields) {}
2082    void Enter(CodeGenFunction &CGF) override {
2083      auto &Rt =
2084          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2085      if (GlobalizedRD) {
2086        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2087        I->getSecond().GlobalRecord = GlobalizedRD;
2088        I->getSecond().MappedParams =
2089            llvm::make_unique<CodeGenFunction::OMPMapVars>();
2090        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2091        for (const auto &Pair : MappedDeclsFields) {
2092          assert(Pair.getFirst()->isCanonicalDecl() &&
2093                 "Expected canonical declaration");
2094          Data.insert(std::make_pair(Pair.getFirst(),
2095                                     MappedVarData(Pair.getSecond(),
2096                                                   /*IsOnePerTeam=*/true)));
2097        }
2098      }
2099      Rt.emitGenericVarsProlog(CGF, Loc);
2100    }
2101    void Exit(CodeGenFunction &CGF) override {
2102      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2103          .emitGenericVarsEpilog(CGF);
2104    }
2105  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2106  CodeGen.setAction(Action);
2107  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2108      D, ThreadIDVar, InnermostKind, CodeGen);
2109  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2110  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2111  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2112
2113  return OutlinedFun;
2114}
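// Editor's note: the attribute rewrite above strips NoInline/OptimizeNone and
// adds AlwaysInline, so the teams outlined function is folded into the kernel
// entry; presumably this keeps the globalized-variable prolog/epilog visible
// to later device passes.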
2115
2116void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2117                                                 SourceLocation Loc,
2118                                                 bool WithSPMDCheck) {
2119  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2120      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2121    return;
2122
2123  CGBuilderTy &Bld = CGF.Builder;
2124
2125  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2126  if (I == FunctionGlobalizedDecls.end())
2127    return;
2128  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2129    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2130    QualType SecGlobalRecTy;
2131
2132    // Recover pointer to this function's global record. The runtime will
2133    // handle the specifics of the allocation of the memory.
2134    // Use actual memory size of the record including the padding
2135    // for alignment purposes.
2136    unsigned Alignment =
2137        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2138    unsigned GlobalRecordSize =
2139        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2140    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2141
2142    llvm::PointerType *GlobalRecPtrTy =
2143        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2144    llvm::Value *GlobalRecCastAddr;
2145    llvm::Value *IsTTD = nullptr;
2146    if (!IsInTTDRegion &&
2147        (WithSPMDCheck ||
2148         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2149      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2150      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2151      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2152      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2153        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2154        llvm::Value *ThreadID = getThreadID(CGF, Loc);
2155        llvm::Value *PL = CGF.EmitRuntimeCall(
2156            createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2157            {RTLoc, ThreadID});
2158        IsTTD = Bld.CreateIsNull(PL);
2159      }
2160      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2161          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2162      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2163      // There is no need to emit line number for unconditional branch.
2164      (void)ApplyDebugLocation::CreateEmpty(CGF);
2165      CGF.EmitBlock(SPMDBB);
2166      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2167                               CharUnits::fromQuantity(Alignment));
2168      CGF.EmitBranch(ExitBB);
2169      // There is no need to emit line number for unconditional branch.
2170      (void)ApplyDebugLocation::CreateEmpty(CGF);
2171      CGF.EmitBlock(NonSPMDBB);
2172      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2173      if (const RecordDecl *SecGlobalizedVarsRecord =
2174              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2175        SecGlobalRecTy =
2176            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2177
2178        // Recover pointer to this function's global record. The runtime will
2179        // handle the specifics of the allocation of the memory.
2180        // Use actual memory size of the record including the padding
2181        // for alignment purposes.
2182        unsigned Alignment =
2183            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2184        unsigned GlobalRecordSize =
2185            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2186        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2187        Size = Bld.CreateSelect(
2188            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2189      }
2190      // TODO: allow the usage of shared memory to be controlled by
2191      // the user, for now, default to global.
2192      llvm::Value *GlobalRecordSizeArg[] = {
2193          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2194      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2195          createNVPTXRuntimeFunction(
2196              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2197          GlobalRecordSizeArg);
2198      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2199          GlobalRecValue, GlobalRecPtrTy);
2200      CGF.EmitBlock(ExitBB);
2201      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2202                                /*NumReservedValues=*/2, "_select_stack");
2203      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2204      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2205      GlobalRecCastAddr = Phi;
2206      I->getSecond().GlobalRecordAddr = Phi;
2207      I->getSecond().IsInSPMDModeFlag = IsSPMD;
2208    } else if (IsInTTDRegion) {
2209      assert(GlobalizedRecords.back().Records.size() < 2 &&
2210             "Expected less than 2 globalized records: one for target and one "
2211             "for teams.");
2212      unsigned Offset = 0;
2213      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2214        QualType RDTy = CGM.getContext().getRecordType(RD);
2215        unsigned Alignment =
2216            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2217        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2218        Offset =
2219            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2220      }
2221      unsigned Alignment =
2222          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2223      Offset = llvm::alignTo(Offset, Alignment);
2224      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2225      ++GlobalizedRecords.back().RegionCounter;
2226      if (GlobalizedRecords.back().Records.size() == 1) {
2227        assert(KernelStaticGlobalized &&
2228               "Kernel static pointer must be initialized already.");
2229        auto *UseSharedMemory = new llvm::GlobalVariable(
2230            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2231            llvm::GlobalValue::InternalLinkage, nullptr,
2232            "_openmp_static_kernel$is_shared");
2233        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2234        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2235            /*DestWidth=*/16, /*Signed=*/0);
2236        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2237            Address(UseSharedMemory,
2238                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
2239            /*Volatile=*/false, Int16Ty, Loc);
2240        auto *StaticGlobalized = new llvm::GlobalVariable(
2241            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2242            llvm::GlobalValue::CommonLinkage, nullptr);
2243        auto *RecSize = new llvm::GlobalVariable(
2244            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2245            llvm::GlobalValue::InternalLinkage, nullptr,
2246            "_openmp_static_kernel$size");
2247        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2248        llvm::Value *Ld = CGF.EmitLoadOfScalar(
2249            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2250            CGM.getContext().getSizeType(), Loc);
2251        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2252            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2253        llvm::Value *GlobalRecordSizeArg[] = {
2254            llvm::ConstantInt::get(
2255                CGM.Int16Ty,
2256                getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2257            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2258        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2259                                OMPRTL_NVPTX__kmpc_get_team_static_memory),
2260                            GlobalRecordSizeArg);
2261        GlobalizedRecords.back().Buffer = StaticGlobalized;
2262        GlobalizedRecords.back().RecSize = RecSize;
2263        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2264        GlobalizedRecords.back().Loc = Loc;
2265      }
2266      assert(KernelStaticGlobalized && "Global address must be set already.");
2267      Address FrameAddr = CGF.EmitLoadOfPointer(
2268          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2269          CGM.getContext()
2270              .getPointerType(CGM.getContext().VoidPtrTy)
2271              .castAs<PointerType>());
2272      llvm::Value *GlobalRecValue =
2273          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2274      I->getSecond().GlobalRecordAddr = GlobalRecValue;
2275      I->getSecond().IsInSPMDModeFlag = nullptr;
2276      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2277          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2278    } else {
2279      // TODO: allow the usage of shared memory to be controlled by
2280      // the user, for now, default to global.
2281      llvm::Value *GlobalRecordSizeArg[] = {
2282          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2283          CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2284      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2285          createNVPTXRuntimeFunction(
2286              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2287          GlobalRecordSizeArg);
2288      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2289          GlobalRecValue, GlobalRecPtrTy);
2290      I->getSecond().GlobalRecordAddr = GlobalRecValue;
2291      I->getSecond().IsInSPMDModeFlag = nullptr;
2292    }
2293    LValue Base =
2294        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2295
2296    // Emit the "global alloca" which is a GEP from the global declaration
2297    // record using the pointer returned by the runtime.
2298    LValue SecBase;
2299    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2300    if (IsTTD) {
2301      SecIt = I->getSecond().SecondaryLocalVarData->begin();
2302      llvm::PointerType *SecGlobalRecPtrTy =
2303          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2304      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2305          Bld.CreatePointerBitCastOrAddrSpaceCast(
2306              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2307          SecGlobalRecTy);
2308    }
2309    for (auto &Rec : I->getSecond().LocalVarData) {
2310      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2311      llvm::Value *ParValue;
2312      if (EscapedParam) {
2313        const auto *VD = cast<VarDecl>(Rec.first);
2314        LValue ParLVal =
2315            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2316        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2317      }
2318      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2319      // Emit VarAddr basing on lane-id if required.
2320      QualType VarTy;
2321      if (Rec.second.IsOnePerTeam) {
2322        VarTy = Rec.second.FD->getType();
2323      } else {
2324        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2325            VarAddr.getAddress().getPointer(),
2326            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2327        VarTy =
2328            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2329        VarAddr = CGF.MakeAddrLValue(
2330            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2331            AlignmentSource::Decl);
2332      }
2333      Rec.second.PrivateAddr = VarAddr.getAddress();
2334      if (!IsInTTDRegion &&
2335          (WithSPMDCheck ||
2336           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2337        assert(I->getSecond().IsInSPMDModeFlag &&
2338               "Expected unknown execution mode or required SPMD check.");
2339        if (IsTTD) {
2340          assert(SecIt->second.IsOnePerTeam &&
2341                 "Secondary glob data must be one per team.");
2342          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2343          VarAddr.setAddress(
2344              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2345                                       VarAddr.getPointer()),
2346                      VarAddr.getAlignment()));
2347          Rec.second.PrivateAddr = VarAddr.getAddress();
2348        }
2349        Address GlobalPtr = Rec.second.PrivateAddr;
2350        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2351        Rec.second.PrivateAddr = Address(
2352            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2353                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
2354            LocalAddr.getAlignment());
2355      }
2356      if (EscapedParam) {
2357        const auto *VD = cast<VarDecl>(Rec.first);
2358        CGF.EmitStoreOfScalar(ParValue, VarAddr);
2359        I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2360      }
2361      if (IsTTD)
2362        ++SecIt;
2363    }
2364  }
2365  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2366    // Recover pointer to this function's global record. The runtime will
2367    // handle the specifics of the allocation of the memory.
2368    // Use actual memory size of the record including the padding
2369    // for alignment purposes.
2370    CGBuilderTy &Bld = CGF.Builder;
2371    llvm::Value *Size = CGF.getTypeSize(VD->getType());
2372    CharUnits Align = CGM.getContext().getDeclAlign(VD);
2373    Size = Bld.CreateNUWAdd(
2374        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2375    llvm::Value *AlignVal =
2376        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2377    Size = Bld.CreateUDiv(Size, AlignVal);
2378    Size = Bld.CreateNUWMul(Size, AlignVal);
2379    // TODO: allow the usage of shared memory to be controlled by
2380    // the user, for now, default to global.
2381    llvm::Value *GlobalRecordSizeArg[] = {
2382        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2383    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2384        createNVPTXRuntimeFunction(
2385            OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2386        GlobalRecordSizeArg);
2387    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2388        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2389    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2390                                     CGM.getContext().getDeclAlign(VD),
2391                                     AlignmentSource::Decl);
2392    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2393                                            Base.getAddress());
2394    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2395  }
2396  I->getSecond().MappedParams->apply(CGF);
2397}
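// The escaped-VLA loop above rounds each allocation size up to the declared
// alignment before calling __kmpc_data_sharing_coalesced_push_stack. A
// minimal sketch of that arithmetic (editorial illustration; the helper name
// is hypothetical and not part of this file):
//
//   uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
//     return (Size + Align - 1) / Align * Align; // e.g. (10 + 7) / 8 * 8 == 16
//   }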
2398
2399void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2400                                                 bool WithSPMDCheck) {
2401  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2402      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2403    return;
2404
2405  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2406  if (I != FunctionGlobalizedDecls.end()) {
2407    I->getSecond().MappedParams->restore(CGF);
2408    if (!CGF.HaveInsertPoint())
2409      return;
2410    for (llvm::Value *Addr :
2411         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2412      CGF.EmitRuntimeCall(
2413          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2414          Addr);
2415    }
2416    if (I->getSecond().GlobalRecordAddr) {
2417      if (!IsInTTDRegion &&
2418          (WithSPMDCheck ||
2419           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2420        CGBuilderTy &Bld = CGF.Builder;
2421        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2422        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2423        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2424        // There is no need to emit line number for unconditional branch.
2425        (void)ApplyDebugLocation::CreateEmpty(CGF);
2426        CGF.EmitBlock(NonSPMDBB);
2427        CGF.EmitRuntimeCall(
2428            createNVPTXRuntimeFunction(
2429                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2430            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2431        CGF.EmitBlock(ExitBB);
2432      } else if (IsInTTDRegion) {
2433        assert(GlobalizedRecords.back().RegionCounter > 0 &&
2434               "region counter must be > 0.");
2435        --GlobalizedRecords.back().RegionCounter;
2436        // Emit the restore function only in the target region.
2437        if (GlobalizedRecords.back().RegionCounter == 0) {
2438          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2439              /*DestWidth=*/16, /*Signed=*/0);
2440          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2441              Address(GlobalizedRecords.back().UseSharedMemory,
2442                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
2443              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2444          llvm::Value *Args[] = {
2445              llvm::ConstantInt::get(
2446                  CGM.Int16Ty,
2447                  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2448              IsInSharedMemory};
2449          CGF.EmitRuntimeCall(
2450              createNVPTXRuntimeFunction(
2451                  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2452              Args);
2453        }
2454      } else {
2455        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2456                                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2457                            I->getSecond().GlobalRecordAddr);
2458      }
2459    }
2460  }
2461}
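// For illustration, a user-level fragment that exercises this prolog/epilog
// pair in generic data-sharing mode (a sketch; the actual globalization
// decision is made by the escape analysis in this file):
//
//   #pragma omp target
//   {
//     int x = 0;            // address escapes into the parallel region
//     #pragma omp parallel shared(x)
//     { /* reads and writes x */ }
//   }
//
// Here `x` is placed in the globalized record pushed in emitGenericVarsProlog
// and released by the pop_stack call emitted above.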
2462
2463void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2464                                         const OMPExecutableDirective &D,
2465                                         SourceLocation Loc,
2466                                         llvm::Function *OutlinedFn,
2467                                         ArrayRef<llvm::Value *> CapturedVars) {
2468  if (!CGF.HaveInsertPoint())
2469    return;
2470
2471  Address ZeroAddr = CGF.CreateMemTemp(
2472      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2473      /*Name*/ ".zero.addr");
2474  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2475  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2476  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2477  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2478  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2479  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2480}
2481
2482void CGOpenMPRuntimeNVPTX::emitParallelCall(
2483    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2484    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2485  if (!CGF.HaveInsertPoint())
2486    return;
2487
2488  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2489    emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2490  else
2491    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2492}
2493
2494void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2495    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2496    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2497  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2498
2499  // Force inline this outlined function at its call site.
2500  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2501
2502  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2503                                           /*DestWidth=*/32, /*Signed=*/1),
2504                                       ".zero.addr");
2505  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2506  // ThreadId for serialized parallels is 0.
2507  Address ThreadIDAddr = ZeroAddr;
2508  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2509                       CodeGenFunction &CGF, PrePostActionTy &Action) {
2510    Action.Enter(CGF);
2511
2512    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2513    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2514    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2515    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2516    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2517  };
2518  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2519                                        PrePostActionTy &) {
2520
2521    RegionCodeGenTy RCG(CodeGen);
2522    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2523    llvm::Value *ThreadID = getThreadID(CGF, Loc);
2524    llvm::Value *Args[] = {RTLoc, ThreadID};
2525
2526    NVPTXActionTy Action(
2527        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2528        Args,
2529        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2530        Args);
2531    RCG.setAction(Action);
2532    RCG(CGF);
2533  };
2534
2535  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2536                                                  PrePostActionTy &Action) {
2537    CGBuilderTy &Bld = CGF.Builder;
2538    llvm::Function *WFn = WrapperFunctionsMap[Fn];
2539    assert(WFn && "Wrapper function does not exist!");
2540    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2541
2542    // Prepare for parallel region. Indicate the outlined function.
2543    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2544    CGF.EmitRuntimeCall(
2545        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2546        Args);
2547
2548    // Create a private scope that will globalize the arguments
2549    // passed from the outside of the target region.
2550    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2551
2552    // There's something to share.
2553    if (!CapturedVars.empty()) {
2554      // Prepare for parallel region. Indicate the outlined function.
2555      Address SharedArgs =
2556          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2557      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2558
2559      llvm::Value *DataSharingArgs[] = {
2560          SharedArgsPtr,
2561          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2562      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2563                              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2564                          DataSharingArgs);
2565
2566      // Store variable address in a list of references to pass to workers.
2567      unsigned Idx = 0;
2568      ASTContext &Ctx = CGF.getContext();
2569      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2570          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2571                          .castAs<PointerType>());
2572      for (llvm::Value *V : CapturedVars) {
2573        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2574        llvm::Value *PtrV;
2575        if (V->getType()->isIntegerTy())
2576          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2577        else
2578          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2579        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2580                              Ctx.getPointerType(Ctx.VoidPtrTy));
2581        ++Idx;
2582      }
2583    }
2584
2585    // Activate workers. This barrier is used by the master to signal
2586    // work for the workers.
2587    syncCTAThreads(CGF);
2588
2589    // OpenMP [2.5, Parallel Construct, p.49]
2590    // There is an implied barrier at the end of a parallel region. After the
2591    // end of a parallel region, only the master thread of the team resumes
2592    // execution of the enclosing task region.
2593    //
2594    // The master waits at this barrier until all workers are done.
2595    syncCTAThreads(CGF);
2596
2597    if (!CapturedVars.empty())
2598      CGF.EmitRuntimeCall(
2599          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2600
2601    // Remember for post-processing in worker loop.
2602    Work.emplace_back(WFn);
2603  };
2604
2605  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2606                             CodeGenFunction &CGF, PrePostActionTy &Action) {
2607    if (IsInParallelRegion) {
2608      SeqGen(CGF, Action);
2609    } else if (IsInTargetMasterThreadRegion) {
2610      L0ParallelGen(CGF, Action);
2611    } else {
2612      // Check for master and then parallelism:
2613      // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2614      //   Serialized execution.
2615      // } else {
2616      //   Worker call.
2617      // }
2618      CGBuilderTy &Bld = CGF.Builder;
2619      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2620      llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2621      llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2622      llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2623      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2624          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2625      Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2626      // There is no need to emit line number for unconditional branch.
2627      (void)ApplyDebugLocation::CreateEmpty(CGF);
2628      CGF.EmitBlock(ParallelCheckBB);
2629      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2630      llvm::Value *ThreadID = getThreadID(CGF, Loc);
2631      llvm::Value *PL = CGF.EmitRuntimeCall(
2632          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2633          {RTLoc, ThreadID});
2634      llvm::Value *Res = Bld.CreateIsNotNull(PL);
2635      Bld.CreateCondBr(Res, SeqBB, MasterBB);
2636      CGF.EmitBlock(SeqBB);
2637      SeqGen(CGF, Action);
2638      CGF.EmitBranch(ExitBB);
2639      // There is no need to emit line number for unconditional branch.
2640      (void)ApplyDebugLocation::CreateEmpty(CGF);
2641      CGF.EmitBlock(MasterBB);
2642      L0ParallelGen(CGF, Action);
2643      CGF.EmitBranch(ExitBB);
2644      // There is no need to emit line number for unconditional branch.
2645      (void)ApplyDebugLocation::CreateEmpty(CGF);
2646      // Emit the continuation block for code after the if.
2647      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2648    }
2649  };
2650
2651  if (IfCond) {
2652    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2653  } else {
2654    CodeGenFunction::RunCleanupsScope Scope(CGF);
2655    RegionCodeGenTy ThenRCG(LNParallelGen);
2656    ThenRCG(CGF);
2657  }
2658}
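// Taken together, the L0 worker path above emits roughly the following
// master-side sequence (pseudocode over the runtime entry points declared
// earlier in this file):
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   if (num_captured)
//     __kmpc_begin_sharing_variables(&arg_list, num_captured);
//   barrier(); // release the workers
//   barrier(); // wait until the workers are done
//   if (num_captured)
//     __kmpc_end_sharing_variables();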
2659
2660void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2661    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2662    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2663  // Just call the outlined function to execute the parallel region.
2664  // OutlinedFn(&GTid, &zero, CapturedStruct);
2665  //
2666  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2667
2668  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2669                                           /*DestWidth=*/32, /*Signed=*/1),
2670                                       ".zero.addr");
2671  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2672  // ThreadId for serialized parallels is 0.
2673  Address ThreadIDAddr = ZeroAddr;
2674  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2675                    &ThreadIDAddr](CodeGenFunction &CGF,
2676                                   PrePostActionTy &Action) {
2677    Action.Enter(CGF);
2678
2679    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2680    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2681    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2682    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2683    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2684  };
2685  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2686                                        PrePostActionTy &) {
2687
2688    RegionCodeGenTy RCG(CodeGen);
2689    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2690    llvm::Value *ThreadID = getThreadID(CGF, Loc);
2691    llvm::Value *Args[] = {RTLoc, ThreadID};
2692
2693    NVPTXActionTy Action(
2694        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2695        Args,
2696        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2697        Args);
2698    RCG.setAction(Action);
2699    RCG(CGF);
2700  };
2701
2702  if (IsInTargetMasterThreadRegion) {
2703    // In the worker we need to use the real thread id.
2704    ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2705    RegionCodeGenTy RCG(CodeGen);
2706    RCG(CGF);
2707  } else {
2708    // If we are not in the target region, it is definitely L2 parallelism or
2709    // more, because for SPMD mode we always have an L1 parallel level, so we don't
2710    // need to check for orphaned directives.
2711    RegionCodeGenTy RCG(SeqGen);
2712    RCG(CGF);
2713  }
2714}
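// In SPMD mode, the SeqGen path above serializes nested (L2+) parallel
// regions. The emitted sequence is roughly (pseudocode):
//
//   __kmpc_serialized_parallel(loc, gtid);
//   outlined_fn(&zero, &zero, captured_vars...); // thread id 0 when serialized
//   __kmpc_end_serialized_parallel(loc, gtid);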
2715
2716void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2717  // Always emit simple barriers!
2718  if (!CGF.HaveInsertPoint())
2719    return;
2720  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2721  // This function does not use parameters, so we can emit just default values.
2722  llvm::Value *Args[] = {
2723      llvm::ConstantPointerNull::get(
2724          cast<llvm::PointerType>(getIdentTyPointerTy())),
2725      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2726  CGF.EmitRuntimeCall(
2727      createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2728}
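// With the default arguments above, the emitted call is simply (LLVM IR
// sketch, assuming the usual ident_t lowering of this era):
//
//   call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)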
2729
2730void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2731                                           SourceLocation Loc,
2732                                           OpenMPDirectiveKind Kind, bool,
2733                                           bool) {
2734  // Always emit simple barriers!
2735  if (!CGF.HaveInsertPoint())
2736    return;
2737  // Build call __kmpc_barrier(loc, thread_id);
2738  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2739  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2740                         getThreadID(CGF, Loc)};
2741  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2742}
2743
2744void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2745    CodeGenFunction &CGF, StringRef CriticalName,
2746    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2747    const Expr *Hint) {
2748  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2749  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2750  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2751  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2752  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2753
2754  // Fetch team-local id of the thread.
2755  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2756
2757  // Get the width of the team.
2758  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2759
2760  // Initialize the counter variable for the loop.
2761  QualType Int32Ty =
2762      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2763  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2764  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2765  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2766                        /*isInit=*/true);
2767
2768  // Block checks if loop counter exceeds upper bound.
2769  CGF.EmitBlock(LoopBB);
2770  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2771  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2772  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2773
2774  // Block tests which single thread should execute region, and which threads
2775  // should go straight to synchronisation point.
2776  CGF.EmitBlock(TestBB);
2777  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2778  llvm::Value *CmpThreadToCounter =
2779      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2780  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2781
2782  // Block emits the body of the critical region.
2783  CGF.EmitBlock(BodyBB);
2784
2785  // Output the critical statement.
2786  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2787                                      Hint);
2788
2789  // After the body surrounded by the critical region, the single executing
2790  // thread will jump to the synchronisation point.
2791  // Block waits for all threads in current team to finish then increments the
2792  // counter variable and returns to the loop.
2793  CGF.EmitBlock(SyncBB);
2794  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2795                  /*ForceSimpleCall=*/true);
2796
2797  llvm::Value *IncCounterVal =
2798      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2799  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2800  CGF.EmitBranch(LoopBB);
2801
2802  // Block that is reached when all threads in the team complete the region.
2803  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2804}
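// The block structure above amounts to the following per-thread loop
// (pseudocode; `tid` is the team-local thread id and `width` the value
// returned by getNVPTXNumThreads):
//
//   for (counter = 0; counter < width; ++counter) {
//     if (tid == counter)
//       /* execute the critical body (still guarded by the generic runtime) */;
//     barrier(); // all threads in the team
//   }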
2805
2806/// Cast value to the specified type.
2807static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2808                                    QualType ValTy, QualType CastTy,
2809                                    SourceLocation Loc) {
2810  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2811         "Cast type must be sized.");
2812  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2813         "Val type must be sized.");
2814  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2815  if (ValTy == CastTy)
2816    return Val;
2817  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2818      CGF.getContext().getTypeSizeInChars(CastTy))
2819    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2820  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2821    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2822                                     CastTy->hasSignedIntegerRepresentation());
2823  Address CastItem = CGF.CreateMemTemp(CastTy);
2824  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2825      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2826  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2827  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2828}
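// The final fallback above is plain type punning through memory: the value is
// stored with its original type and reloaded with the target type. A
// host-side analogue for the same-size case (hypothetical helper, shown only
// for illustration):
//
//   int32_t bitsOfFloat(float F) {
//     int32_t I;
//     std::memcpy(&I, &F, sizeof(I)); // store as float, reload as int32
//     return I;
//   }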
2829
2830/// This function creates calls to one of two shuffle functions to copy
2831/// variables between lanes in a warp.
2832static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2833                                                 llvm::Value *Elem,
2834                                                 QualType ElemType,
2835                                                 llvm::Value *Offset,
2836                                                 SourceLocation Loc) {
2837  CodeGenModule &CGM = CGF.CGM;
2838  CGBuilderTy &Bld = CGF.Builder;
2839  CGOpenMPRuntimeNVPTX &RT =
2840      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2841
2842  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2843  assert(Size.getQuantity() <= 8 &&
2844         "Unsupported bitwidth in shuffle instruction.");
2845
2846  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2847                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
2848                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;
2849
2850  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2851  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2852      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2853  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2854  llvm::Value *WarpSize =
2855      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2856
2857  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2858      RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2859
2860  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2861}
2862
2863static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2864                            Address DestAddr, QualType ElemType,
2865                            llvm::Value *Offset, SourceLocation Loc) {
2866  CGBuilderTy &Bld = CGF.Builder;
2867
2868  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2869  // Create the loop over the big sized data.
2870  // ptr = (void*)Elem;
2871  // ptrEnd = (void*) Elem + 1;
2872  // Step = 8;
2873  // while (ptr + Step < ptrEnd)
2874  //   shuffle((int64_t)*ptr);
2875  // Step = 4;
2876  // while (ptr + Step < ptrEnd)
2877  //   shuffle((int32_t)*ptr);
2878  // ...
2879  Address ElemPtr = DestAddr;
2880  Address Ptr = SrcAddr;
2881  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2882      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2883  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2884    if (Size < CharUnits::fromQuantity(IntSize))
2885      continue;
2886    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2887        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2888        /*Signed=*/1);
2889    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2890    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2891    ElemPtr =
2892        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2893    if (Size.getQuantity() / IntSize > 1) {
2894      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2895      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2896      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2897      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2898      CGF.EmitBlock(PreCondBB);
2899      llvm::PHINode *PhiSrc =
2900          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2901      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2902      llvm::PHINode *PhiDest =
2903          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2904      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2905      Ptr = Address(PhiSrc, Ptr.getAlignment());
2906      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2907      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2908          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2909                                   Ptr.getPointer(), CGF.VoidPtrTy));
2910      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2911                       ThenBB, ExitBB);
2912      CGF.EmitBlock(ThenBB);
2913      llvm::Value *Res = createRuntimeShuffleFunction(
2914          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2915          IntType, Offset, Loc);
2916      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2917      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2918      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2919      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2920      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2921      CGF.EmitBranch(PreCondBB);
2922      CGF.EmitBlock(ExitBB);
2923    } else {
2924      llvm::Value *Res = createRuntimeShuffleFunction(
2925          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2926          IntType, Offset, Loc);
2927      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2928      Ptr = Bld.CreateConstGEP(Ptr, 1);
2929      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2930    }
2931    Size = Size % IntSize;
2932  }
2933}
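// Worked instances of the chunking above: a 12-byte element moves as one
// 8-byte shuffle (12 / 8 == 1, so no loop) followed by one 4-byte shuffle
// (12 % 8 == 4); a 24-byte element takes the loop path at IntSize == 8
// (24 / 8 == 3 iterations) and then stops, since 24 % 8 == 0.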
2934
2935namespace {
2936enum CopyAction : unsigned {
2937  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2938  // the warp using shuffle instructions.
2939  RemoteLaneToThread,
2940  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2941  ThreadCopy,
2942  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2943  ThreadToScratchpad,
2944  // ScratchpadToThread: Copy from a scratchpad array in global memory
2945  // containing team-reduced data to a thread's stack.
2946  ScratchpadToThread,
2947};
2948} // namespace
2949
2950struct CopyOptionsTy {
2951  llvm::Value *RemoteLaneOffset;
2952  llvm::Value *ScratchpadIndex;
2953  llvm::Value *ScratchpadWidth;
2954};
2955
2956/// Emit instructions to copy a Reduce list, which contains partially
2957/// aggregated values, in the specified direction.
2958static void emitReductionListCopy(
2959    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2960    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2961    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2962
2963  CodeGenModule &CGM = CGF.CGM;
2964  ASTContext &C = CGM.getContext();
2965  CGBuilderTy &Bld = CGF.Builder;
2966
2967  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2968  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2969  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2970
2971  // Iterate, element by element, through the source Reduce list and
2972  // make a copy.
2973  unsigned Idx = 0;
2974  unsigned Size = Privates.size();
2975  for (const Expr *Private : Privates) {
2976    Address SrcElementAddr = Address::invalid();
2977    Address DestElementAddr = Address::invalid();
2978    Address DestElementPtrAddr = Address::invalid();
2979    // Should we shuffle in an element from a remote lane?
2980    bool ShuffleInElement = false;
2981    // Set to true to update the pointer in the dest Reduce list to a
2982    // newly created element.
2983    bool UpdateDestListPtr = false;
2984    // Increment the src or dest pointer to the scratchpad, for each
2985    // new element.
2986    bool IncrScratchpadSrc = false;
2987    bool IncrScratchpadDest = false;
2988
2989    switch (Action) {
2990    case RemoteLaneToThread: {
2991      // Step 1.1: Get the address for the src element in the Reduce list.
2992      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2993      SrcElementAddr = CGF.EmitLoadOfPointer(
2994          SrcElementPtrAddr,
2995          C.getPointerType(Private->getType())->castAs<PointerType>());
2996
2997      // Step 1.2: Create a temporary to store the element in the destination
2998      // Reduce list.
2999      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3000      DestElementAddr =
3001          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3002      ShuffleInElement = true;
3003      UpdateDestListPtr = true;
3004      break;
3005    }
3006    case ThreadCopy: {
3007      // Step 1.1: Get the address for the src element in the Reduce list.
3008      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3009      SrcElementAddr = CGF.EmitLoadOfPointer(
3010          SrcElementPtrAddr,
3011          C.getPointerType(Private->getType())->castAs<PointerType>());
3012
3013      // Step 1.2: Get the address for dest element.  The destination
3014      // element has already been created on the thread's stack.
3015      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3016      DestElementAddr = CGF.EmitLoadOfPointer(
3017          DestElementPtrAddr,
3018          C.getPointerType(Private->getType())->castAs<PointerType>());
3019      break;
3020    }
3021    case ThreadToScratchpad: {
3022      // Step 1.1: Get the address for the src element in the Reduce list.
3023      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3024      SrcElementAddr = CGF.EmitLoadOfPointer(
3025          SrcElementPtrAddr,
3026          C.getPointerType(Private->getType())->castAs<PointerType>());
3027
3028      // Step 1.2: Get the address for dest element:
3029      // address = base + index * ElementSizeInChars.
3030      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3031      llvm::Value *CurrentOffset =
3032          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3033      llvm::Value *ScratchPadElemAbsolutePtrVal =
3034          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3035      ScratchPadElemAbsolutePtrVal =
3036          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3037      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3038                                C.getTypeAlignInChars(Private->getType()));
3039      IncrScratchpadDest = true;
3040      break;
3041    }
3042    case ScratchpadToThread: {
3043      // Step 1.1: Get the address for the src element in the scratchpad.
3044      // address = base + index * ElementSizeInChars.
3045      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3046      llvm::Value *CurrentOffset =
3047          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3048      llvm::Value *ScratchPadElemAbsolutePtrVal =
3049          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3050      ScratchPadElemAbsolutePtrVal =
3051          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3052      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3053                               C.getTypeAlignInChars(Private->getType()));
3054      IncrScratchpadSrc = true;
3055
3056      // Step 1.2: Create a temporary to store the element in the destination
3057      // Reduce list.
3058      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3059      DestElementAddr =
3060          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3061      UpdateDestListPtr = true;
3062      break;
3063    }
3064    }
3065
3066    // Regardless of src and dest of copy, we emit the load of src
3067    // element as this is required in all directions
3068    SrcElementAddr = Bld.CreateElementBitCast(
3069        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3070    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3071                                               SrcElementAddr.getElementType());
3072
3073    // Now that all active lanes have read the element in the
3074    // Reduce list, shuffle over the value from the remote lane.
3075    if (ShuffleInElement) {
3076      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3077                      RemoteLaneOffset, Private->getExprLoc());
3078    } else {
3079      switch (CGF.getEvaluationKind(Private->getType())) {
3080      case TEK_Scalar: {
3081        llvm::Value *Elem =
3082            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3083                                 Private->getType(), Private->getExprLoc());
3084        // Store the source element value to the dest element address.
3085        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3086                              Private->getType());
3087        break;
3088      }
3089      case TEK_Complex: {
3090        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3091            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3092            Private->getExprLoc());
3093        CGF.EmitStoreOfComplex(
3094            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3095            /*isInit=*/false);
3096        break;
3097      }
3098      case TEK_Aggregate:
3099        CGF.EmitAggregateCopy(
3100            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3101            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3102            Private->getType(), AggValueSlot::DoesNotOverlap);
3103        break;
3104      }
3105    }
3106
3107    // Step 3.1: Modify reference in dest Reduce list as needed.
3108    // Modifying the reference in Reduce list to point to the newly
3109    // created element.  The element is live in the current function
3110    // scope and that of functions it invokes (i.e., reduce_function).
3111    // RemoteReduceData[i] = (void*)&RemoteElem
3112    if (UpdateDestListPtr) {
3113      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3114                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
3115                            DestElementPtrAddr, /*Volatile=*/false,
3116                            C.VoidPtrTy);
3117    }
3118
3119    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3120    // address of the next element in scratchpad memory, unless we're currently
3121    // processing the last one.  Memory alignment is also taken care of here.
3122    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3123      llvm::Value *ScratchpadBasePtr =
3124          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3125      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3126      ScratchpadBasePtr = Bld.CreateNUWAdd(
3127          ScratchpadBasePtr,
3128          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3129
3130      // Take care of global memory alignment for performance
3131      ScratchpadBasePtr = Bld.CreateNUWSub(
3132          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3133      ScratchpadBasePtr = Bld.CreateUDiv(
3134          ScratchpadBasePtr,
3135          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3136      ScratchpadBasePtr = Bld.CreateNUWAdd(
3137          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3138      ScratchpadBasePtr = Bld.CreateNUWMul(
3139          ScratchpadBasePtr,
3140          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3141
3142      if (IncrScratchpadDest)
3143        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3144      else /* IncrScratchpadSrc = true */
3145        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3146    }
3147
3148    ++Idx;
3149  }
3150}
3151
3152/// This function emits a helper that gathers Reduce lists from the first
3153/// lane of every active warp to lanes in the first warp.
3154///
3155/// void inter_warp_copy_func(void* reduce_data, num_warps)
3156///   shared smem[warp_size];
3157///   For all data entries D in reduce_data:
3158///     sync
3159///     If (I am the first lane in each warp)
3160///       Copy my local D to smem[warp_id]
3161///     sync
3162///     if (I am the first warp)
3163///       Copy smem[thread_id] to my local D
3164static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3165                                              ArrayRef<const Expr *> Privates,
3166                                              QualType ReductionArrayTy,
3167                                              SourceLocation Loc) {
3168  ASTContext &C = CGM.getContext();
3169  llvm::Module &M = CGM.getModule();
3170
3171  // ReduceList: thread local Reduce list.
3172  // At the stage of the computation when this function is called, partially
3173  // aggregated values reside in the first lane of every active warp.
3174  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3175                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3176  // NumWarps: number of warps active in the parallel region.  This could
3177  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3178  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3179                                C.getIntTypeForBitwidth(32/* Signed */ true),
3180                                ImplicitParamDecl::Other);
3181  FunctionArgList Args;
3182  Args.push_back(&ReduceListArg);
3183  Args.push_back(&NumWarpsArg);
3184
3185  const CGFunctionInfo &CGFI =
3186      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3187  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3188                                    llvm::GlobalValue::InternalLinkage,
3189                                    "_omp_reduction_inter_warp_copy_func", &M);
3190  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3191  Fn->setDoesNotRecurse();
3192  CodeGenFunction CGF(CGM);
3193  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3194
3195  CGBuilderTy &Bld = CGF.Builder;
3196
3197  // This array is used as a medium to transfer, one reduce element at a time,
3198  // the data from the first lane of every warp to lanes in the first warp
3199  // in order to perform the final step of a reduction in a parallel region
3200  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
3201  // for reduced latency, as well as to have a distinct copy for concurrently
3202  // executing target regions.  The array is declared with common linkage so
3203  // as to be shared across compilation units.
3204  StringRef TransferMediumName =
3205      "__openmp_nvptx_data_transfer_temporary_storage";
3206  llvm::GlobalVariable *TransferMedium =
3207      M.getGlobalVariable(TransferMediumName);
3208  if (!TransferMedium) {
3209    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3210    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3211    TransferMedium = new llvm::GlobalVariable(
3212        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3213        llvm::Constant::getNullValue(Ty), TransferMediumName,
3214        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3215        SharedAddressSpace);
3216    CGM.addCompilerUsedGlobal(TransferMedium);
3217  }
3218
3219  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3220  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3221  // nvptx_lane_id = nvptx_id % warpsize
3222  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3223  // nvptx_warp_id = nvptx_id / warpsize
3224  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3225
3226  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3227  Address LocalReduceList(
3228      Bld.CreatePointerBitCastOrAddrSpaceCast(
3229          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3230                               C.VoidPtrTy, Loc),
3231          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3232      CGF.getPointerAlign());
3233
3234  unsigned Idx = 0;
3235  for (const Expr *Private : Privates) {
3236    //
3237    // Warp master copies reduce element to transfer medium in __shared__
3238    // memory.
3239    //
3240    unsigned RealTySize =
3241        C.getTypeSizeInChars(Private->getType())
3242            .alignTo(C.getTypeAlignInChars(Private->getType()))
3243            .getQuantity();
3244    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3245      unsigned NumIters = RealTySize / TySize;
3246      if (NumIters == 0)
3247        continue;
3248      QualType CType = C.getIntTypeForBitwidth(
3249          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3250      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3251      CharUnits Align = CharUnits::fromQuantity(TySize);
3252      llvm::Value *Cnt = nullptr;
3253      Address CntAddr = Address::invalid();
3254      llvm::BasicBlock *PrecondBB = nullptr;
3255      llvm::BasicBlock *ExitBB = nullptr;
3256      if (NumIters > 1) {
3257        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3258        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3259                              /*Volatile=*/false, C.IntTy);
3260        PrecondBB = CGF.createBasicBlock("precond");
3261        ExitBB = CGF.createBasicBlock("exit");
3262        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3263        // There is no need to emit line number for unconditional branch.
3264        (void)ApplyDebugLocation::CreateEmpty(CGF);
3265        CGF.EmitBlock(PrecondBB);
3266        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3267        llvm::Value *Cmp =
3268            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3269        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3270        CGF.EmitBlock(BodyBB);
3271      }
3272      // kmpc_barrier.
3273      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3274                                             /*EmitChecks=*/false,
3275                                             /*ForceSimpleCall=*/true);
3276      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3277      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3278      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3279
3280      // if (lane_id == 0)
3281      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3282      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3283      CGF.EmitBlock(ThenBB);
3284
3285      // Reduce element = LocalReduceList[i]
3286      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3287      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3288          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3289      // elemptr = ((CopyType*)(elemptrptr)) + I
3290      Address ElemPtr = Address(ElemPtrPtr, Align);
3291      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3292      if (NumIters > 1) {
3293        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3294                          ElemPtr.getAlignment());
3295      }
3296
3297      // Get pointer to location in transfer medium.
3298      // MediumPtr = &medium[warp_id]
3299      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3300          TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3301      Address MediumPtr(MediumPtrVal, Align);
3302      // Casting to actual data type.
3303      // MediumPtr = (CopyType*)MediumPtrAddr;
3304      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3305
3306      // elem = *elemptr
3307      //*MediumPtr = elem
3308      llvm::Value *Elem =
3309          CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3310      // Store the source element value to the dest element address.
3311      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3312
3313      Bld.CreateBr(MergeBB);
3314
3315      CGF.EmitBlock(ElseBB);
3316      Bld.CreateBr(MergeBB);
3317
3318      CGF.EmitBlock(MergeBB);
3319
3320      // kmpc_barrier.
3321      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3322                                             /*EmitChecks=*/false,
3323                                             /*ForceSimpleCall=*/true);
3324
3325      //
3326      // Warp 0 copies reduce element from transfer medium.
3327      //
3328      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3329      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3330      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3331
3332      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3333      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3334          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3335
3336      // Up to 32 threads in warp 0 are active.
3337      llvm::Value *IsActiveThread =
3338          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3339      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3340
3341      CGF.EmitBlock(W0ThenBB);
3342
3343      // SrcMediumPtr = &medium[tid]
3344      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3345          TransferMedium,
3346          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3347      Address SrcMediumPtr(SrcMediumPtrVal, Align);
3348      // SrcMediumVal = *SrcMediumPtr;
3349      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3350
3351      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3352      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3353      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3354          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3355      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3356      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3357      if (NumIters > 1) {
3358        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3359                                TargetElemPtr.getAlignment());
3360      }
3361
3362      // *TargetElemPtr = SrcMediumVal;
3363      llvm::Value *SrcMediumValue =
3364          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3365      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3366                            CType);
3367      Bld.CreateBr(W0MergeBB);
3368
3369      CGF.EmitBlock(W0ElseBB);
3370      Bld.CreateBr(W0MergeBB);
3371
3372      CGF.EmitBlock(W0MergeBB);
3373
3374      if (NumIters > 1) {
3375        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3376        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3377        CGF.EmitBranch(PrecondBB);
3378        (void)ApplyDebugLocation::CreateEmpty(CGF);
3379        CGF.EmitBlock(ExitBB);
3380      }
3381      RealTySize %= TySize;
3382    }
3383    ++Idx;
3384  }
3385
3386  CGF.FinishFunction();
3387  return Fn;
3388}
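// Worked instance of the chunk loop above: a (hypothetical) 7-byte element
// crosses the transfer medium as one 4-byte chunk (7 / 4 == 1), then one
// 2-byte chunk (3 / 2 == 1), then one 1-byte chunk, with the two kmpc
// barriers framing each pass.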
3389
3390/// Emit a helper that reduces data across two OpenMP threads (lanes)
3391/// in the same warp.  It uses shuffle instructions to copy over data from
3392/// a remote lane's stack.  The reduction algorithm performed is specified
3393/// by the fourth parameter.
3394///
3395/// Algorithm Versions.
3396/// Full Warp Reduce (argument value 0):
3397///   This algorithm assumes that all 32 lanes are active and gathers
3398///   data from these 32 lanes, producing a single resultant value.
3399/// Contiguous Partial Warp Reduce (argument value 1):
3400///   This algorithm assumes that only a *contiguous* subset of lanes
3401///   are active.  This happens for the last warp in a parallel region
3402///   when the user specified num_threads is not an integer multiple of
3403///   32.  This contiguous subset always starts with the zeroth lane.
3404/// Partial Warp Reduce (argument value 2):
3405///   This algorithm gathers data from any number of lanes at any position.
3406/// All reduced values are stored in the lowest possible lane.  The set
3407/// of problems every algorithm addresses is a super set of those
3408/// addressable by algorithms with a lower version number.  Overhead
3409/// increases as algorithm version increases.
3410///
3411/// Terminology
3412/// Reduce element:
3413///   Reduce element refers to the individual data field with primitive
3414///   data types to be combined and reduced across threads.
3415/// Reduce list:
3416///   Reduce list refers to a collection of local, thread-private
3417///   reduce elements.
3418/// Remote Reduce list:
3419///   Remote Reduce list refers to a collection of remote (relative to
3420///   the current thread) reduce elements.
3421///
3422/// We distinguish between three states of threads that are important to
3423/// the implementation of this function.
3424/// Alive threads:
3425///   Threads in a warp executing the SIMT instruction, as distinguished from
3426///   threads that are inactive due to divergent control flow.
3427/// Active threads:
3428///   The minimal set of threads that has to be alive upon entry to this
3429///   function.  The computation is correct iff active threads are alive.
3430///   Some threads are alive but they are not active because they do not
3431///   contribute to the computation in any useful manner.  Turning them off
3432///   may introduce control flow overheads without any tangible benefits.
3433/// Effective threads:
3434///   In order to comply with the argument requirements of the shuffle
3435///   function, we must keep all lanes holding data alive.  But at most
3436///   half of them perform value aggregation; we refer to this half of
3437///   threads as effective. The other half is simply handing off their
3438///   data.
3439///
3440/// Procedure
3441/// Value shuffle:
3442///   In this step active threads transfer data from higher lane positions
3443///   in the warp to lower lane positions, creating Remote Reduce list.
3444/// Value aggregation:
3445///   In this step, effective threads combine their thread local Reduce list
3446///   with Remote Reduce list and store the result in the thread local
3447///   Reduce list.
3448/// Value copy:
3449///   In this step, we deal with the assumption made by algorithm 2
3450///   (i.e. contiguity assumption).  When we have an odd number of lanes
3451///   active, say 2k+1, only k threads will be effective and therefore k
3452///   new values will be produced.  However, the Reduce list owned by the
3453///   (2k+1)th thread is ignored in the value aggregation.  Therefore
3454///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3455///   that the contiguity assumption still holds.
3456static llvm::Function *emitShuffleAndReduceFunction(
3457    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3458    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3459  ASTContext &C = CGM.getContext();
3460
3461  // Thread local Reduce list used to host the values of data to be reduced.
3462  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3463                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3464  // Current lane id; could be logical.
3465  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3466                              ImplicitParamDecl::Other);
3467  // Offset of the remote source lane relative to the current lane.
3468  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3469                                        C.ShortTy, ImplicitParamDecl::Other);
3470  // Algorithm version.  This is expected to be known at compile time.
3471  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3472                               C.ShortTy, ImplicitParamDecl::Other);
3473  FunctionArgList Args;
3474  Args.push_back(&ReduceListArg);
3475  Args.push_back(&LaneIDArg);
3476  Args.push_back(&RemoteLaneOffsetArg);
3477  Args.push_back(&AlgoVerArg);
3478
3479  const CGFunctionInfo &CGFI =
3480      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3481  auto *Fn = llvm::Function::Create(
3482      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3483      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3484  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3485  Fn->setDoesNotRecurse();
3486  CodeGenFunction CGF(CGM);
3487  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3488
3489  CGBuilderTy &Bld = CGF.Builder;
3490
3491  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3492  Address LocalReduceList(
3493      Bld.CreatePointerBitCastOrAddrSpaceCast(
3494          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3495                               C.VoidPtrTy, SourceLocation()),
3496          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3497      CGF.getPointerAlign());
3498
3499  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3500  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3501      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3502
3503  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3504  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3505      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3506
3507  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3508  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3509      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3510
3511  // Create a local thread-private variable to host the Reduce list
3512  // from a remote lane.
3513  Address RemoteReduceList =
3514      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3515
3516  // This loop iterates through the list of reduce elements and copies,
3517  // element by element, from a remote lane in the warp to RemoteReduceList,
3518  // hosted on the thread's stack.
3519  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3520                        LocalReduceList, RemoteReduceList,
3521                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3522                         /*ScratchpadIndex=*/nullptr,
3523                         /*ScratchpadWidth=*/nullptr});
3524
3525  // The actions to be performed on the Remote Reduce list depend
3526  // on the algorithm version.
3527  //
3528  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3529  //  LaneId % 2 == 0 && Offset > 0):
3530  //    do the reduction value aggregation
3531  //
3532  //  The thread local variable Reduce list is mutated in place to host the
3533  //  reduced data, which is the aggregated value produced from local and
3534  //  remote lanes.
3535  //
3536  //  Note that AlgoVer is expected to be a constant integer known at compile
3537  //  time.
3538  //  When AlgoVer==0, the first conjunction evaluates to true, making
3539  //    the entire predicate true at compile time.
3540  //  When AlgoVer==1, only the second part of the second conjunction needs
3541  //    to be evaluated at runtime; the other conjunctions evaluate to false
3542  //    at compile time.
3543  //  When AlgoVer==2, only the second part of the third conjunction needs
3544  //    to be evaluated at runtime; the other conjunctions evaluate to false
3545  //    at compile time.
3546  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3547
3548  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3549  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3550      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3551
3552  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3553  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3554      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3555  CondAlgo2 = Bld.CreateAnd(
3556      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3557
3558  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3559  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3560
3561  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3562  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3563  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3564  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3565
3566  CGF.EmitBlock(ThenBB);
3567  // reduce_function(LocalReduceList, RemoteReduceList)
3568  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3569      LocalReduceList.getPointer(), CGF.VoidPtrTy);
3570  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3571      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3572  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3573      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3574  Bld.CreateBr(MergeBB);
3575
3576  CGF.EmitBlock(ElseBB);
3577  Bld.CreateBr(MergeBB);
3578
3579  CGF.EmitBlock(MergeBB);
3580
3581  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3582  // Reduce list.
3583  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3584  llvm::Value *CondCopy = Bld.CreateAnd(
3585      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3586
3587  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3588  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3589  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3590  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3591
3592  CGF.EmitBlock(CpyThenBB);
3593  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3594                        RemoteReduceList, LocalReduceList);
3595  Bld.CreateBr(CpyMergeBB);
3596
3597  CGF.EmitBlock(CpyElseBB);
3598  Bld.CreateBr(CpyMergeBB);
3599
3600  CGF.EmitBlock(CpyMergeBB);
3601
3602  CGF.FinishFunction();
3603  return Fn;
3604}
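
// For reference, the helper emitted above is morally equivalent to the
// following pseudo-C (an illustrative sketch only: the names are invented
// here, and the real copies happen element-by-element via shuffle
// intrinsics):
//
//   void _omp_reduction_shuffle_and_reduce_func(void *reduce_list,
//       int16_t lane_id, int16_t offset, int16_t algo_ver) {
//     ReduceList remote;
//     copy_from_remote_lane(&remote, reduce_list, offset); // via shuffles
//     if (algo_ver == 0 || (algo_ver == 1 && lane_id < offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && offset > 0))
//       reduce_function(reduce_list, &remote); // aggregate in place
//     if (algo_ver == 1 && lane_id >= offset)
//       copy_to_local(reduce_list, &remote);   // keep active lanes contiguous
//   }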
3605
3606/// This function emits a helper that copies all the reduction variables from
3607/// the team into the provided global buffer for the reduction variables.
3608///
3609/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3610///   For all data entries D in reduce_data:
3611///     Copy local D to buffer.D[Idx]
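///
///   For example, with two entries D0 (double) and D1 (float), and a buffer
///   laid out as struct { double D0[NumRecs]; float D1[NumRecs]; }, the
///   emitted helper performs, in effect (a sketch):
///     buffer->D0[Idx] = *reduce_data.D0;
///     buffer->D1[Idx] = *reduce_data.D1;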
3612static llvm::Value *emitListToGlobalCopyFunction(
3613    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3614    QualType ReductionArrayTy, SourceLocation Loc,
3615    const RecordDecl *TeamReductionRec,
3616    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3617        &VarFieldMap) {
3618  ASTContext &C = CGM.getContext();
3619
3620  // Buffer: global reduction buffer.
3621  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3622                              C.VoidPtrTy, ImplicitParamDecl::Other);
3623  // Idx: index of the buffer.
3624  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3625                           ImplicitParamDecl::Other);
3626  // ReduceList: thread local Reduce list.
3627  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3628                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3629  FunctionArgList Args;
3630  Args.push_back(&BufferArg);
3631  Args.push_back(&IdxArg);
3632  Args.push_back(&ReduceListArg);
3633
3634  const CGFunctionInfo &CGFI =
3635      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3636  auto *Fn = llvm::Function::Create(
3637      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3638      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3639  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3640  Fn->setDoesNotRecurse();
3641  CodeGenFunction CGF(CGM);
3642  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3643
3644  CGBuilderTy &Bld = CGF.Builder;
3645
3646  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3647  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3648  Address LocalReduceList(
3649      Bld.CreatePointerBitCastOrAddrSpaceCast(
3650          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3651                               C.VoidPtrTy, Loc),
3652          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3653      CGF.getPointerAlign());
3654  QualType StaticTy = C.getRecordType(TeamReductionRec);
3655  llvm::Type *LLVMReductionsBufferTy =
3656      CGM.getTypes().ConvertTypeForMem(StaticTy);
3657  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3658      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3659      LLVMReductionsBufferTy->getPointerTo());
3660  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3661                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3662                                              /*Volatile=*/false, C.IntTy,
3663                                              Loc)};
3664  unsigned Idx = 0;
3665  for (const Expr *Private : Privates) {
3666    // Reduce element = LocalReduceList[i]
3667    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3668    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3669        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3670    // elemptr = ((CopyType*)(elemptrptr)) + I
3671    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3672        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3673    Address ElemPtr =
3674        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3675    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3676    // Global = Buffer.VD[Idx];
3677    const FieldDecl *FD = VarFieldMap.lookup(VD);
3678    LValue GlobLVal = CGF.EmitLValueForField(
3679        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3680    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3681    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3682    switch (CGF.getEvaluationKind(Private->getType())) {
3683    case TEK_Scalar: {
3684      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3685                                            Private->getType(), Loc);
3686      CGF.EmitStoreOfScalar(V, GlobLVal);
3687      break;
3688    }
3689    case TEK_Complex: {
3690      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3691          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3692      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3693      break;
3694    }
3695    case TEK_Aggregate:
3696      CGF.EmitAggregateCopy(GlobLVal,
3697                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3698                            Private->getType(), AggValueSlot::DoesNotOverlap);
3699      break;
3700    }
3701    ++Idx;
3702  }
3703
3704  CGF.FinishFunction();
3705  return Fn;
3706}
3707
3708/// This function emits a helper that reduces all the reduction variables from
3709/// the team into the provided global buffer for the reduction variables.
3710///
3711/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3712///  void *GlobPtrs[];
3713///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3714///  ...
3715///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3716///  reduce_function(GlobPtrs, reduce_data);
3717static llvm::Value *emitListToGlobalReduceFunction(
3718    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3719    QualType ReductionArrayTy, SourceLocation Loc,
3720    const RecordDecl *TeamReductionRec,
3721    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3722        &VarFieldMap,
3723    llvm::Function *ReduceFn) {
3724  ASTContext &C = CGM.getContext();
3725
3726  // Buffer: global reduction buffer.
3727  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3728                              C.VoidPtrTy, ImplicitParamDecl::Other);
3729  // Idx: index of the buffer.
3730  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3731                           ImplicitParamDecl::Other);
3732  // ReduceList: thread local Reduce list.
3733  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3734                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3735  FunctionArgList Args;
3736  Args.push_back(&BufferArg);
3737  Args.push_back(&IdxArg);
3738  Args.push_back(&ReduceListArg);
3739
3740  const CGFunctionInfo &CGFI =
3741      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3742  auto *Fn = llvm::Function::Create(
3743      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3744      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3745  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3746  Fn->setDoesNotRecurse();
3747  CodeGenFunction CGF(CGM);
3748  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3749
3750  CGBuilderTy &Bld = CGF.Builder;
3751
3752  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3753  QualType StaticTy = C.getRecordType(TeamReductionRec);
3754  llvm::Type *LLVMReductionsBufferTy =
3755      CGM.getTypes().ConvertTypeForMem(StaticTy);
3756  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3757      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3758      LLVMReductionsBufferTy->getPointerTo());
3759
3760  // 1. Build a list of reduction variables.
3761  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3762  Address ReductionList =
3763      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3764  auto IPriv = Privates.begin();
3765  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3766                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3767                                              /*Volatile=*/false, C.IntTy,
3768                                              Loc)};
3769  unsigned Idx = 0;
3770  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3771    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3772    // Global = Buffer.VD[Idx];
3773    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3774    const FieldDecl *FD = VarFieldMap.lookup(VD);
3775    LValue GlobLVal = CGF.EmitLValueForField(
3776        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3777    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3778    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3779    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3780    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3781      // Store array size.
3782      ++Idx;
3783      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3784      llvm::Value *Size = CGF.Builder.CreateIntCast(
3785          CGF.getVLASize(
3786                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3787              .NumElts,
3788          CGF.SizeTy, /*isSigned=*/false);
3789      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3790                              Elem);
3791    }
3792  }
3793
3794  // Call reduce_function(GlobalReduceList, ReduceList)
3795  llvm::Value *GlobalReduceList =
3796      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3797  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3798  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3799      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3800  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3801      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3802  CGF.FinishFunction();
3803  return Fn;
3804}
3805
3806/// This function emits a helper that copies all the reduction variables from
3807/// the provided global buffer back into the team's reduction variables.
3808///
3809/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3810///   For all data entries D in reduce_data:
3811///     Copy buffer.D[Idx] to local D;
3812static llvm::Value *emitGlobalToListCopyFunction(
3813    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3814    QualType ReductionArrayTy, SourceLocation Loc,
3815    const RecordDecl *TeamReductionRec,
3816    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3817        &VarFieldMap) {
3818  ASTContext &C = CGM.getContext();
3819
3820  // Buffer: global reduction buffer.
3821  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3822                              C.VoidPtrTy, ImplicitParamDecl::Other);
3823  // Idx: index of the buffer.
3824  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3825                           ImplicitParamDecl::Other);
3826  // ReduceList: thread local Reduce list.
3827  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3828                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3829  FunctionArgList Args;
3830  Args.push_back(&BufferArg);
3831  Args.push_back(&IdxArg);
3832  Args.push_back(&ReduceListArg);
3833
3834  const CGFunctionInfo &CGFI =
3835      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3836  auto *Fn = llvm::Function::Create(
3837      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3838      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3839  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3840  Fn->setDoesNotRecurse();
3841  CodeGenFunction CGF(CGM);
3842  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3843
3844  CGBuilderTy &Bld = CGF.Builder;
3845
3846  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3847  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3848  Address LocalReduceList(
3849      Bld.CreatePointerBitCastOrAddrSpaceCast(
3850          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3851                               C.VoidPtrTy, Loc),
3852          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3853      CGF.getPointerAlign());
3854  QualType StaticTy = C.getRecordType(TeamReductionRec);
3855  llvm::Type *LLVMReductionsBufferTy =
3856      CGM.getTypes().ConvertTypeForMem(StaticTy);
3857  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3858      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3859      LLVMReductionsBufferTy->getPointerTo());
3860
3861  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3862                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3863                                              /*Volatile=*/false, C.IntTy,
3864                                              Loc)};
3865  unsigned Idx = 0;
3866  for (const Expr *Private : Privates) {
3867    // Reduce element = LocalReduceList[i]
3868    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3869    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3870        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3871    // elemptr = ((CopyType*)(elemptrptr)) + I
3872    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3873        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3874    Address ElemPtr =
3875        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3876    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3877    // Global = Buffer.VD[Idx];
3878    const FieldDecl *FD = VarFieldMap.lookup(VD);
3879    LValue GlobLVal = CGF.EmitLValueForField(
3880        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3881    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3882    GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3883    switch (CGF.getEvaluationKind(Private->getType())) {
3884    case TEK_Scalar: {
3885      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3886      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3887      break;
3888    }
3889    case TEK_Complex: {
3890      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3891      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3892                             /*isInit=*/false);
3893      break;
3894    }
3895    case TEK_Aggregate:
3896      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3897                            GlobLVal, Private->getType(),
3898                            AggValueSlot::DoesNotOverlap);
3899      break;
3900    }
3901    ++Idx;
3902  }
3903
3904  CGF.FinishFunction();
3905  return Fn;
3906}
3907
3908/// This function emits a helper that reduces the data in the provided global
3909/// buffer into the team's reduction variables.
3910///
3911/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3912///  void *GlobPtrs[];
3913///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3914///  ...
3915///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3916///  reduce_function(reduce_data, GlobPtrs);
3917static llvm::Value *emitGlobalToListReduceFunction(
3918    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3919    QualType ReductionArrayTy, SourceLocation Loc,
3920    const RecordDecl *TeamReductionRec,
3921    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3922        &VarFieldMap,
3923    llvm::Function *ReduceFn) {
3924  ASTContext &C = CGM.getContext();
3925
3926  // Buffer: global reduction buffer.
3927  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3928                              C.VoidPtrTy, ImplicitParamDecl::Other);
3929  // Idx: index of the buffer.
3930  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3931                           ImplicitParamDecl::Other);
3932  // ReduceList: thread local Reduce list.
3933  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3934                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3935  FunctionArgList Args;
3936  Args.push_back(&BufferArg);
3937  Args.push_back(&IdxArg);
3938  Args.push_back(&ReduceListArg);
3939
3940  const CGFunctionInfo &CGFI =
3941      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3942  auto *Fn = llvm::Function::Create(
3943      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3944      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3945  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3946  Fn->setDoesNotRecurse();
3947  CodeGenFunction CGF(CGM);
3948  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3949
3950  CGBuilderTy &Bld = CGF.Builder;
3951
3952  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3953  QualType StaticTy = C.getRecordType(TeamReductionRec);
3954  llvm::Type *LLVMReductionsBufferTy =
3955      CGM.getTypes().ConvertTypeForMem(StaticTy);
3956  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3957      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3958      LLVMReductionsBufferTy->getPointerTo());
3959
3960  // 1. Build a list of reduction variables.
3961  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3962  Address ReductionList =
3963      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3964  auto IPriv = Privates.begin();
3965  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3966                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3967                                              /*Volatile=*/false, C.IntTy,
3968                                              Loc)};
3969  unsigned Idx = 0;
3970  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3971    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3972    // Global = Buffer.VD[Idx];
3973    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3974    const FieldDecl *FD = VarFieldMap.lookup(VD);
3975    LValue GlobLVal = CGF.EmitLValueForField(
3976        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3977    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3978    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3979    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3980    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3981      // Store array size.
3982      ++Idx;
3983      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3984      llvm::Value *Size = CGF.Builder.CreateIntCast(
3985          CGF.getVLASize(
3986                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3987              .NumElts,
3988          CGF.SizeTy, /*isSigned=*/false);
3989      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3990                              Elem);
3991    }
3992  }
3993
3994  // Call reduce_function(ReduceList, GlobalReduceList)
3995  llvm::Value *GlobalReduceList =
3996      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3997  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3998  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3999      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
4000  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
4001      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
4002  CGF.FinishFunction();
4003  return Fn;
4004}
4005
4006///
4007/// Design of OpenMP reductions on the GPU
4008///
4009/// Consider a typical OpenMP program with one or more reduction
4010/// clauses:
4011///
4012/// float foo;
4013/// double bar;
4014/// #pragma omp target teams distribute parallel for \
4015///             reduction(+:foo) reduction(*:bar)
4016/// for (int i = 0; i < N; i++) {
4017///   foo += A[i]; bar *= B[i];
4018/// }
4019///
4020/// where 'foo' and 'bar' are reduced across all OpenMP threads in
4021/// all teams.  In our OpenMP implementation on the NVPTX device an
4022/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4023/// within a team are mapped to CUDA threads within a threadblock.
4024/// Our goal is to efficiently aggregate values across all OpenMP
4025/// threads such that:
4026///
4027///   - the compiler and runtime are logically concise, and
4028///   - the reduction is performed efficiently in a hierarchical
4029///     manner as follows: within OpenMP threads in the same warp,
4030///     across warps in a threadblock, and finally across teams on
4031///     the NVPTX device.
4032///
4033/// Introduction to Decoupling
4034///
4035/// We would like to decouple the compiler and the runtime so that the
4036/// latter is ignorant of the reduction variables (number, data types)
4037/// and the reduction operators.  This allows a simpler interface
4038/// and implementation while still attaining good performance.
4039///
4040/// Pseudocode for the aforementioned OpenMP program generated by the
4041/// compiler is as follows:
4042///
4043/// 1. Create private copies of reduction variables on each OpenMP
4044///    thread: 'foo_private', 'bar_private'
4045/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4046///    to it and writes the result in 'foo_private' and 'bar_private'
4047///    respectively.
4048/// 3. Call the OpenMP runtime on the GPU to reduce within a team
4049///    and store the result on the team master:
4050///
4051///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4052///        reduceData, shuffleReduceFn, interWarpCpyFn)
4053///
4054///     where:
4055///       struct ReduceData {
4056///         float *foo;
4057///         double *bar;
4058///       } reduceData
4059///       reduceData.foo = &foo_private
4060///       reduceData.bar = &bar_private
4061///
4062///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4063///     auxiliary functions generated by the compiler that operate on
4064///     variables of type 'ReduceData'.  They help the runtime perform
4065///     the algorithmic steps in a data-agnostic manner.
4066///
4067///     'shuffleReduceFn' is a pointer to a function that reduces data
4068///     of type 'ReduceData' across two OpenMP threads (lanes) in the
4069///     same warp.  It takes the following arguments as input:
4070///
4071///     a. variable of type 'ReduceData' on the calling lane,
4072///     b. its lane_id,
4073///     c. an offset relative to the current lane_id to generate a
4074///        remote_lane_id.  The remote lane contains the second
4075///        variable of type 'ReduceData' that is to be reduced.
4076///     d. an algorithm version parameter determining which reduction
4077///        algorithm to use.
4078///
4079///     'shuffleReduceFn' retrieves data from the remote lane using
4080///     efficient GPU shuffle intrinsics and reduces, using the
4081///     algorithm specified by the 4th parameter, the two operands
4082///     element-wise.  The result is written to the first operand.
4083///
4084///     Different reduction algorithms are implemented in different
4085///     runtime functions, all calling 'shuffleReduceFn' to perform
4086///     the essential reduction step.  Therefore, based on the 4th
4087///     parameter, this function behaves slightly differently to
4088///     cooperate with the runtime to ensure correctness under
4089///     different circumstances.
4090///
4091///     'InterWarpCpyFn' is a pointer to a function that transfers
4092///     reduced variables across warps.  It tunnels, through CUDA
4093///     shared memory, the thread-private data of type 'ReduceData'
4094///     from lane 0 of each warp to a lane in the first warp.
4095/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4096///    The last team writes the global reduced value to memory.
4097///
4098///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
4099///             reduceData, shuffleReduceFn, interWarpCpyFn,
4100///             scratchpadCopyFn, loadAndReduceFn)
4101///
4102///     'scratchpadCopyFn' is a helper that stores reduced
4103///     data from the team master to a scratchpad array in
4104///     global memory.
4105///
4106///     'loadAndReduceFn' is a helper that loads data from
4107///     the scratchpad array and reduces it with the input
4108///     operand.
4109///
4110///     These compiler generated functions hide address
4111///     calculation and alignment information from the runtime.
4112/// 5. if ret == 1:
4113///     The team master of the last team stores the reduced
4114///     result to the globals in memory.
4115///     foo += reduceData.foo; bar *= reduceData.bar
4116///
4117///
4118/// Warp Reduction Algorithms
4119///
4120/// On the warp level, we have three algorithms implemented in the
4121/// OpenMP runtime depending on the number of active lanes:
4122///
4123/// Full Warp Reduction
4124///
4125/// The reduce algorithm within a warp where all lanes are active
4126/// is implemented in the runtime as follows:
4127///
4128/// full_warp_reduce(void *reduce_data,
4129///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4130///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4131///     ShuffleReduceFn(reduce_data, 0, offset, 0);
4132/// }
4133///
4134/// The algorithm completes in log(2, WARPSIZE) steps.
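///
/// For example, with WARPSIZE == 32 the loop executes five times, using
/// offsets 16, 8, 4, 2 and 1, leaving the fully reduced value in lane 0.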
4135///
4136/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
4137/// not used by the algorithm, so we save instructions by not retrieving it
4138/// from the corresponding special registers.  The 4th parameter, which
4139/// represents the version of the algorithm being used, is set to 0 to
4140/// signify full warp reduction.
4141///
4142/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4143///
4144/// #reduce_elem refers to an element in the local lane's data structure
4145/// #remote_elem is retrieved from a remote lane
4146/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4147/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
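///
/// Here 'shuffle_down' stands for the warp shuffle-down primitive: on CUDA
/// targets it corresponds to the __shfl_down_sync() intrinsic (wrapped by
/// the __kmpc_shuffle_int32/__kmpc_shuffle_int64 runtime functions), which
/// reads each element from the lane at lane_id + offset.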
4148///
4149/// Contiguous Partial Warp Reduction
4150///
4151/// This reduce algorithm is used within a warp where only the first
4152/// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
4153/// number of OpenMP threads in a parallel region is not a multiple of
4154/// WARPSIZE.  The algorithm is implemented in the runtime as follows:
4155///
4156/// void
4157/// contiguous_partial_reduce(void *reduce_data,
4158///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
4159///                           int size, int lane_id) {
4160///   int curr_size;
4161///   int offset;
4162///   curr_size = size;
4163///   offset = curr_size/2;
4164///   while (offset>0) {
4165///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4166///     curr_size = (curr_size+1)/2;
4167///     offset = curr_size/2;
4168///   }
4169/// }
4170///
4171/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4172///
4173/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4174/// if (lane_id < offset)
4175///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4176/// else
4177///     reduce_elem = remote_elem
4178///
4179/// This algorithm assumes that the data to be reduced are located in a
4180/// contiguous subset of lanes starting from the first.  When there is
4181/// an odd number of active lanes, the data in the last lane is not
4182///   aggregated with any other lane's data but is instead copied over.
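///
/// For example, with size == 7 the loop first runs with offset == 3 (lanes
/// 0-2 reduce with lanes 3-5, and lane 3 takes a copy of lane 6's data,
/// leaving 4 values), then with offset == 2 (leaving 2 values), and finally
/// with offset == 1, after which the result is in lane 0.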
4183///
4184/// Dispersed Partial Warp Reduction
4185///
4186/// This algorithm is used within a warp when any discontiguous subset of
4187/// lanes are active.  It is used to implement the reduction operation
4188/// across lanes in an OpenMP simd region or in a nested parallel region.
4189///
4190/// void
4191/// dispersed_partial_reduce(void *reduce_data,
4192///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4193///   int size, remote_id;
4194///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
4195///   do {
4196///       remote_id = next_active_lane_id_right_after_me();
4197///       # the above function returns 0 if no active lane
4198///       # is present right after the current lane.
4199///       size = number_of_active_lanes_in_this_warp();
4200///       logical_lane_id /= 2;
4201///       ShuffleReduceFn(reduce_data, logical_lane_id,
4202///                       remote_id-1-threadIdx.x, 2);
4203///   } while (logical_lane_id % 2 == 0 && size > 1);
4204/// }
4205///
4206/// There is no assumption made about the initial state of the reduction.
4207/// Any number of lanes (>=1) could be active at any position.  The reduction
4208/// result is returned in the first active lane.
4209///
4210/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4211///
4212/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4213/// if (lane_id % 2 == 0 && offset > 0)
4214///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4215/// else
4216///     reduce_elem = remote_elem
4217///
4218///
4219/// Intra-Team Reduction
4220///
4221/// This function, as implemented in the runtime call
4222/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4223/// threads in a team.  It first reduces within a warp using the
4224/// aforementioned algorithms.  We then proceed to gather all such
4225/// reduced values at the first warp.
4226///
4227/// The runtime makes use of the function 'InterWarpCpyFn', which copies
4228/// data from each of the "warp masters" (zeroth lane of each warp, where
4229/// warp-reduced data is held) to the zeroth warp.  This step reduces (in
4230/// a mathematical sense) the problem of reduction across warp masters in
4231/// a block to the problem of warp reduction.
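///
/// In pseudocode, the intra-team phase is therefore (illustrative sketch):
///
///   warp_reduce(reduce_data);                // one of the 3 warp algorithms
///   InterWarpCpyFn(reduce_data, num_warps);  // warp masters -> first warp
///   if (warp_id == 0)
///     warp_reduce(reduce_data);              // final value on team master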
4232///
4233///
4234/// Inter-Team Reduction
4235///
4236/// Once a team has reduced its data to a single value, it is stored in
4237/// a global scratchpad array.  Since each team has a distinct slot, this
4238/// can be done without locking.
4239///
4240/// The last team to write to the scratchpad array proceeds to reduce the
4241/// scratchpad array.  One or more workers in the last team use the helper
4242/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4243/// the k'th worker reduces every k'th element.
4244///
4245/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4246/// reduce across workers and compute a globally reduced value.
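///
/// In pseudocode (an illustrative sketch; the names are invented here, and
/// the last-team detection is shown as an atomic counter for concreteness):
///
///   scratchpad[team_id] = team_reduced_value;        // distinct slot, no lock
///   if (atomic_inc(&teams_done) == num_teams - 1) {  // last team to finish
///     for (i = worker_id; i < num_teams; i += num_workers)
///       loadAndReduceFn(scratchpad, i, reduce_data); // k'th worker, k'th elem
///     // reduce across the workers of the last team:
///     __kmpc_nvptx_parallel_reduce_nowait_v2(..., reduce_data, ...);
///   }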
4247///
4248void CGOpenMPRuntimeNVPTX::emitReduction(
4249    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4250    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4251    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4252  if (!CGF.HaveInsertPoint())
4253    return;
4254
4255  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4256#ifndef NDEBUG
4257  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4258#endif
4259
4260  if (Options.SimpleReduction) {
4261    assert(!TeamsReduction && !ParallelReduction &&
4262           "Invalid reduction selection in emitReduction.");
4263    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4264                                   ReductionOps, Options);
4265    return;
4266  }
4267
4268  assert((TeamsReduction || ParallelReduction) &&
4269         "Invalid reduction selection in emitReduction.");
4270
4271  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4272  // RedList, shuffle_reduce_func, interwarp_copy_func);
4273  // or
4274  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4275  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4276  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4277
4278  llvm::Value *Res;
4279  ASTContext &C = CGM.getContext();
4280  // 1. Build a list of reduction variables.
4281  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4282  auto Size = RHSExprs.size();
4283  for (const Expr *E : Privates) {
4284    if (E->getType()->isVariablyModifiedType())
4285      // Reserve place for array size.
4286      ++Size;
4287  }
4288  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4289  QualType ReductionArrayTy =
4290      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
4291                             /*IndexTypeQuals=*/0);
4292  Address ReductionList =
4293      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4294  auto IPriv = Privates.begin();
4295  unsigned Idx = 0;
4296  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4297    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4298    CGF.Builder.CreateStore(
4299        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4300            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4301        Elem);
4302    if ((*IPriv)->getType()->isVariablyModifiedType()) {
4303      // Store array size.
4304      ++Idx;
4305      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4306      llvm::Value *Size = CGF.Builder.CreateIntCast(
4307          CGF.getVLASize(
4308                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4309              .NumElts,
4310          CGF.SizeTy, /*isSigned=*/false);
4311      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4312                              Elem);
4313    }
4314  }
4315
4316  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4317      ReductionList.getPointer(), CGF.VoidPtrTy);
4318  llvm::Function *ReductionFn = emitReductionFunction(
4319      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4320      LHSExprs, RHSExprs, ReductionOps);
4321  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4322  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4323      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4324  llvm::Value *InterWarpCopyFn =
4325      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4326
4327  if (ParallelReduction) {
4328    llvm::Value *Args[] = {RTLoc,
4329                           ThreadId,
4330                           CGF.Builder.getInt32(RHSExprs.size()),
4331                           ReductionArrayTySize,
4332                           RL,
4333                           ShuffleAndReduceFn,
4334                           InterWarpCopyFn};
4335
4336    Res = CGF.EmitRuntimeCall(
4337        createNVPTXRuntimeFunction(
4338            OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4339        Args);
4340  } else {
4341    assert(TeamsReduction && "expected teams reduction.");
4342    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4343    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4344    int Cnt = 0;
4345    for (const Expr *DRE : Privates) {
4346      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4347      ++Cnt;
4348    }
4349    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4350        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4351        C.getLangOpts().OpenMPCUDAReductionBufNum);
4352    TeamsReductions.push_back(TeamReductionRec);
4353    if (!KernelTeamsReductionPtr) {
4354      KernelTeamsReductionPtr = new llvm::GlobalVariable(
4355          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4356          llvm::GlobalValue::InternalLinkage, nullptr,
4357          "_openmp_teams_reductions_buffer_$_$ptr");
4358    }
4359    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4360        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4361        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4362    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4363        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4364    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4365        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4366        ReductionFn);
4367    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4368        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4369    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4370        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4371        ReductionFn);
4372
4373    llvm::Value *Args[] = {
4374        RTLoc,
4375        ThreadId,
4376        GlobalBufferPtr,
4377        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4378        RL,
4379        ShuffleAndReduceFn,
4380        InterWarpCopyFn,
4381        GlobalToBufferCpyFn,
4382        GlobalToBufferRedFn,
4383        BufferToGlobalCpyFn,
4384        BufferToGlobalRedFn};
4385
4386    Res = CGF.EmitRuntimeCall(
4387        createNVPTXRuntimeFunction(
4388            OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4389        Args);
4390  }
4391
4392  // 5. Build if (res == 1)
4393  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4394  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4395  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4396      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4397  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4398
4399  // 6. Build then branch: where we have reduced values in the master
4400  //    thread in each team.
4401  //    __kmpc_end_reduce{_nowait}(<gtid>);
4402  //    break;
4403  CGF.EmitBlock(ThenBB);
4404
4405  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4406  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4407                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4408    auto IPriv = Privates.begin();
4409    auto ILHS = LHSExprs.begin();
4410    auto IRHS = RHSExprs.begin();
4411    for (const Expr *E : ReductionOps) {
4412      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4413                                  cast<DeclRefExpr>(*IRHS));
4414      ++IPriv;
4415      ++ILHS;
4416      ++IRHS;
4417    }
4418  };
4419  llvm::Value *EndArgs[] = {ThreadId};
4420  RegionCodeGenTy RCG(CodeGen);
4421  NVPTXActionTy Action(
4422      nullptr, llvm::None,
4423      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4424      EndArgs);
4425  RCG.setAction(Action);
4426  RCG(CGF);
4427  // There is no need to emit line number for unconditional branch.
4428  (void)ApplyDebugLocation::CreateEmpty(CGF);
4429  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4430}
4431
4432const VarDecl *
4433CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4434                                         const VarDecl *NativeParam) const {
4435  if (!NativeParam->getType()->isReferenceType())
4436    return NativeParam;
4437  QualType ArgType = NativeParam->getType();
4438  QualifierCollector QC;
4439  const Type *NonQualTy = QC.strip(ArgType);
4440  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4441  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4442    if (Attr->getCaptureKind() == OMPC_map) {
4443      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4444                                                        LangAS::opencl_global);
4445    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4446               PointeeTy.isConstant(CGM.getContext())) {
4447      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4448                                                        LangAS::opencl_generic);
4449    }
4450  }
4451  ArgType = CGM.getContext().getPointerType(PointeeTy);
4452  QC.addRestrict();
4453  enum { NVPTX_local_addr = 5 };
4454  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4455  ArgType = QC.apply(CGM.getContext(), ArgType);
4456  if (isa<ImplicitParamDecl>(NativeParam))
4457    return ImplicitParamDecl::Create(
4458        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4459        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4460  return ParmVarDecl::Create(
4461      CGM.getContext(),
4462      const_cast<DeclContext *>(NativeParam->getDeclContext()),
4463      NativeParam->getBeginLoc(), NativeParam->getLocation(),
4464      NativeParam->getIdentifier(), ArgType,
4465      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4466}
4467
4468Address
4469CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4470                                          const VarDecl *NativeParam,
4471                                          const VarDecl *TargetParam) const {
4472  assert(NativeParam != TargetParam &&
4473         NativeParam->getType()->isReferenceType() &&
4474         "Native arg must not be the same as target arg.");
4475  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4476  QualType NativeParamType = NativeParam->getType();
4477  QualifierCollector QC;
4478  const Type *NonQualTy = QC.strip(NativeParamType);
4479  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4480  unsigned NativePointeeAddrSpace =
4481      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4482  QualType TargetTy = TargetParam->getType();
4483  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4484      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4485  // First cast to generic.
4486  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4487      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4488                      /*AddrSpace=*/0));
4489  // Cast from generic to native address space.
4490  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4491      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4492                      NativePointeeAddrSpace));
4493  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4494  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4495                        NativeParamType);
4496  return NativeParamAddr;
4497}
4498
4499void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4500    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4501    ArrayRef<llvm::Value *> Args) const {
4502  SmallVector<llvm::Value *, 4> TargetArgs;
4503  TargetArgs.reserve(Args.size());
4504  auto *FnType = OutlinedFn.getFunctionType();
4505  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4506    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4507      TargetArgs.append(std::next(Args.begin(), I), Args.end());
4508      break;
4509    }
4510    llvm::Type *TargetType = FnType->getParamType(I);
4511    llvm::Value *NativeArg = Args[I];
4512    if (!TargetType->isPointerTy()) {
4513      TargetArgs.emplace_back(NativeArg);
4514      continue;
4515    }
4516    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4517        NativeArg,
4518        NativeArg->getType()->getPointerElementType()->getPointerTo());
4519    TargetArgs.emplace_back(
4520        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4521  }
4522  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4523}
4524
4525/// Emit a function that wraps the outlined parallel region
4526/// and controls the arguments that are passed to it.
4527/// The wrapper ensures that the outlined function is called
4528/// with the correct arguments when data is shared.
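///
/// Conceptually, the generated wrapper is (illustrative sketch; the
/// parameter names are invented here):
///
///   void outlined_parallel_fn_wrapper(int16_t parallel_level, int32_t tid) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     // Forward the shared captures to the real outlined function.
///     outlined_parallel_fn(&tid, &zero, shared_args[0], ...,
///                          shared_args[n-1]);
///   }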
4529llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4530    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4531  ASTContext &Ctx = CGM.getContext();
4532  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4533
4534  // Create a function that takes as argument the source thread.
4535  FunctionArgList WrapperArgs;
4536  QualType Int16QTy =
4537      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4538  QualType Int32QTy =
4539      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4540  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4541                                     /*Id=*/nullptr, Int16QTy,
4542                                     ImplicitParamDecl::Other);
4543  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4544                               /*Id=*/nullptr, Int32QTy,
4545                               ImplicitParamDecl::Other);
4546  WrapperArgs.emplace_back(&ParallelLevelArg);
4547  WrapperArgs.emplace_back(&WrapperArg);
4548
4549  const CGFunctionInfo &CGFI =
4550      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4551
4552  auto *Fn = llvm::Function::Create(
4553      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4554      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4555  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4556  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4557  Fn->setDoesNotRecurse();
4558
4559  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4560  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4561                    D.getBeginLoc(), D.getBeginLoc());
4562
4563  const auto *RD = CS.getCapturedRecordDecl();
4564  auto CurField = RD->field_begin();
4565
4566  Address ZeroAddr = CGF.CreateMemTemp(
4567      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4568      /*Name*/ ".zero.addr");
4569  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4570  // Get the array of arguments.
4571  SmallVector<llvm::Value *, 8> Args;
4572
4573  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4574  Args.emplace_back(ZeroAddr.getPointer());
4575
4576  CGBuilderTy &Bld = CGF.Builder;
4577  auto CI = CS.capture_begin();
4578
4579  // Use global memory for data sharing.
4580  // Handle passing of global args to workers.
4581  Address GlobalArgs =
4582      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4583  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4584  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4585  CGF.EmitRuntimeCall(
4586      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4587      DataSharingArgs);
4588
4589  // Retrieve the shared variables from the list of references returned
4590  // by the runtime. Pass the variables to the outlined function.
4591  Address SharedArgListAddress = Address::invalid();
4592  if (CS.capture_size() > 0 ||
4593      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4594    SharedArgListAddress = CGF.EmitLoadOfPointer(
4595        GlobalArgs, CGF.getContext()
4596                        .getPointerType(CGF.getContext().getPointerType(
4597                            CGF.getContext().VoidPtrTy))
4598                        .castAs<PointerType>());
4599  }
4600  unsigned Idx = 0;
4601  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4602    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4603    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4604        Src, CGF.SizeTy->getPointerTo());
4605    llvm::Value *LB = CGF.EmitLoadOfScalar(
4606        TypedAddress,
4607        /*Volatile=*/false,
4608        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4609        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4610    Args.emplace_back(LB);
4611    ++Idx;
4612    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4613    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4614        Src, CGF.SizeTy->getPointerTo());
4615    llvm::Value *UB = CGF.EmitLoadOfScalar(
4616        TypedAddress,
4617        /*Volatile=*/false,
4618        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4619        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4620    Args.emplace_back(UB);
4621    ++Idx;
4622  }
4623  if (CS.capture_size() > 0) {
4624    ASTContext &CGFContext = CGF.getContext();
4625    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4626      QualType ElemTy = CurField->getType();
4627      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4628      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4629          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4630      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4631                                              /*Volatile=*/false,
4632                                              CGFContext.getPointerType(ElemTy),
4633                                              CI->getLocation());
4634      if (CI->capturesVariableByCopy() &&
4635          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4636        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4637                              CI->getLocation());
4638      }
4639      Args.emplace_back(Arg);
4640    }
4641  }
4642
4643  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4644  CGF.FinishFunction();
4645  return Fn;
4646}
4647
4648void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4649                                              const Decl *D) {
4650  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4651    return;
4652
4653  assert(D && "Expected function or captured|block decl.");
4654  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4655         "Function is registered already.");
4656  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4657         "Team is set but not processed.");
4658  const Stmt *Body = nullptr;
4659  bool NeedToDelayGlobalization = false;
4660  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4661    Body = FD->getBody();
4662  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4663    Body = BD->getBody();
4664  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4665    Body = CD->getBody();
4666    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4667    if (NeedToDelayGlobalization &&
4668        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4669      return;
4670  }
4671  if (!Body)
4672    return;
4673  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4674  VarChecker.Visit(Body);
4675  const RecordDecl *GlobalizedVarsRecord =
4676      VarChecker.getGlobalizedRecord(IsInTTDRegion);
4677  TeamAndReductions.first = nullptr;
4678  TeamAndReductions.second.clear();
4679  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4680      VarChecker.getEscapedVariableLengthDecls();
4681  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4682    return;
4683  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4684  I->getSecond().MappedParams =
4685      llvm::make_unique<CodeGenFunction::OMPMapVars>();
4686  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4687  I->getSecond().EscapedParameters.insert(
4688      VarChecker.getEscapedParameters().begin(),
4689      VarChecker.getEscapedParameters().end());
4690  I->getSecond().EscapedVariableLengthDecls.append(
4691      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4692  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4693  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4694     (0) . __assert_fail ("VD->isCanonicalDecl() && \"Expected canonical declaration\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4694, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4695    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4696    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4697  }
4698  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4699    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4700    VarChecker.Visit(Body);
4701    I->getSecond().SecondaryGlobalRecord =
4702        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4703    I->getSecond().SecondaryLocalVarData.emplace();
4704    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4705    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4706       (0) . __assert_fail ("VD->isCanonicalDecl() && \"Expected canonical declaration\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4706, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4707      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4708      Data.insert(
4709          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4710    }
4711  }
4712  if (!NeedToDelayGlobalization) {
4713    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4714    struct GlobalizationScope final : EHScopeStack::Cleanup {
4715      GlobalizationScope() = default;
4716
4717      void Emit(CodeGenFunction &CGF, Flags flags) override {
4718        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4719            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4720      }
4721    };
4722    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4723  }
4724}
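// As an illustration of when this prolog globalizes anything: in generic
// data-sharing mode, a local variable of the team master that is referenced
// from a later parallel region escapes its stack frame and is moved into the
// globalized record registered above, e.g.:
// \code
//   #pragma omp target
//   {
//     int Escaped = 0;   // globalized: workers read/write the master's copy
//     #pragma omp parallel
//     Escaped += 1;
//     int Priv = 0;      // does not escape: stays a plain stack local
//   }
// \endcode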
4725
4726Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4727                                                        const VarDecl *VD) {
4728  bool UseDefaultAllocator = true;
4729  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4730    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4731    switch (A->getAllocatorType()) {
4732      // Use the default allocator here as by default local vars are
4733      // threadlocal.
4734    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4735    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4736      // Just pass-through to check if the globalization is required.
4737      break;
4738    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4739    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4740    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4741    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4742    case OMPAllocateDeclAttr::OMPConstMemAlloc:
4743    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4744    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4745      UseDefaultAllocator = false;
4746      break;
4747    }
4748  }
4749
4750  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4751    return Address::invalid();
4752
4753  VD = VD->getCanonicalDecl();
4754  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4755  if (I == FunctionGlobalizedDecls.end())
4756    return Address::invalid();
4757  auto VDI = I->getSecond().LocalVarData.find(VD);
4758  if (VDI != I->getSecond().LocalVarData.end())
4759    return VDI->second.PrivateAddr;
4760  if (VD->hasAttrs()) {
4761    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4762         E(VD->attr_end());
4763         IT != E; ++IT) {
4764      auto VDI = I->getSecond().LocalVarData.find(
4765          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4766              ->getCanonicalDecl());
4767      if (VDI != I->getSecond().LocalVarData.end())
4768        return VDI->second.PrivateAddr;
4769    }
4770  }
4771
4772  // TODO: replace this with return
4773  // UseDefaultAllocator ? Address::invalid() :
4774  // CGOpenMPRuntime::getAddressOfLocalVariable(CGF, VD); once the NVPTX
4775  // libomp supports __kmpc_alloc|__kmpc_free.
4776  (void)UseDefaultAllocator; // Prevent an unused-variable warning.
4777  return Address::invalid();
4778}
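// A sketch of the allocator handling above (assuming OpenMP 5.0 allocate
// syntax): only the default and thread allocators fall through to the
// globalization check; all other predefined allocators suppress the default
// allocator path.
// \code
//   int X;
//   #pragma omp allocate(X) allocator(omp_thread_mem_alloc)  // default path
//   int Y;
//   #pragma omp allocate(Y) allocator(omp_cgroup_mem_alloc)  // non-default
// \endcode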
4779
4780void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4781  FunctionGlobalizedDecls.erase(CGF.CurFn);
4782  CGOpenMPRuntime::functionFinished(CGF);
4783}
4784
4785void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4786    CodeGenFunction &CGF, const OMPLoopDirective &S,
4787    OpenMPDistScheduleClauseKind &ScheduleKind,
4788    llvm::Value *&Chunk) const {
4789  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4790    ScheduleKind = OMPC_DIST_SCHEDULE_static;
4791    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4792        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4793        S.getIterationVariable()->getType(), S.getBeginLoc());
4794    return;
4795  }
4796  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4797      CGF, S, ScheduleKind, Chunk);
4798}
4799
4800void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4801    CodeGenFunction &CGF, const OMPLoopDirective &S,
4802    OpenMPScheduleClauseKind &ScheduleKind,
4803    const Expr *&ChunkExpr) const {
4804  ScheduleKind = OMPC_SCHEDULE_static;
4805  // Chunk size is 1 in this case.
4806  llvm::APInt ChunkSize(32, 1);
4807  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4808      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4809      SourceLocation());
4810}
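// Taken together, the two overrides above make an SPMD construct such as
// \code
//   #pragma omp target teams distribute parallel for
//   for (int I = 0; I < N; ++I)
//     A[I] += 1;
// \endcode
// behave as if dist_schedule(static, <number-of-threads>) and
// schedule(static, 1) had been written explicitly, so consecutive threads
// touch consecutive elements and global memory accesses coalesce.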
4811
4812void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4813    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4814  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4815         "Expected target-based directive.");
4816  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4817  for (const CapturedStmt::Capture &C : CS->captures()) {
4818    // Capture variables captured by reference in lambdas for target-based
4819    // directives.
4820    if (!C.capturesVariable())
4821      continue;
4822    const VarDecl *VD = C.getCapturedVar();
4823    const auto *RD = VD->getType()
4824                         .getCanonicalType()
4825                         .getNonReferenceType()
4826                         ->getAsCXXRecordDecl();
4827    if (!RD || !RD->isLambda())
4828      continue;
4829    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4830    LValue VDLVal;
4831    if (VD->getType().getCanonicalType()->isReferenceType())
4832      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4833    else
4834      VDLVal = CGF.MakeAddrLValue(
4835          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4836    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4837    FieldDecl *ThisCapture = nullptr;
4838    RD->getCaptureFields(Captures, ThisCapture);
4839    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4840      LValue ThisLVal =
4841          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4842      llvm::Value *CXXThis = CGF.LoadCXXThis();
4843      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4844    }
4845    for (const LambdaCapture &LC : RD->captures()) {
4846      if (LC.getCaptureKind() != LCK_ByRef)
4847        continue;
4848      const VarDecl *VD = LC.getCapturedVar();
4849      if (!CS->capturesVariable(VD))
4850        continue;
4851      auto It = Captures.find(VD);
4852       (0) . __assert_fail ("It != Captures.end() && \"Found lambda capture without field.\"", "/home/seafit/code_projects/clang_source/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp", 4852, __PRETTY_FUNCTION__))" file_link="../../../include/assert.h.html#88" macro="true">assert(It != Captures.end() && "Found lambda capture without field.");
4853      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4854      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4855      if (VD->getType().getCanonicalType()->isReferenceType())
4856        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4857                                               VD->getType().getCanonicalType())
4858                     .getAddress();
4859      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4860    }
4861  }
4862}
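// Example of the capture rewiring performed above (sketch): for a lambda that
// captures by reference and is used inside a target region,
// \code
//   int A = 0;
//   auto L = [&A]() { A += 1; };
//   #pragma omp target map(tofrom : A)
//   L();
// \endcode
// the field for 'A' inside the device copy of 'L' is re-pointed at the
// device-side instance of 'A' before the body runs.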
4863
4864unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
4865  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4866}
4867
4868bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4869                                                            LangAS &AS) {
4870  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4871    return false;
4872  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4873  switch (A->getAllocatorType()) {
4874  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4875  // Not supported, fall back to the default memory space.
4876  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4877  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4878  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4879  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4880  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4881    AS = LangAS::Default;
4882    return true;
4883  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4884    AS = LangAS::cuda_constant;
4885    return true;
4886  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4887    AS = LangAS::cuda_shared;
4888    return true;
4889  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4890    llvm_unreachable("Expected predefined allocator for the variables with the "
4891                     "static storage.");
4892  }
4893  return false;
4894}
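// In effect (a hedged sketch of the mapping above): omp_const_mem_alloc
// places the global in CUDA constant memory and omp_pteam_mem_alloc in CUDA
// shared memory, while the remaining predefined allocators use the default
// address space.
// \code
//   int G;
//   #pragma omp allocate(G) allocator(omp_const_mem_alloc)  // -> __constant__
//   int S;
//   #pragma omp allocate(S) allocator(omp_pteam_mem_alloc)  // -> __shared__
// \endcode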
4895
4896// Get the current CudaArch, ignoring any unknown values.
4897static CudaArch getCudaArch(CodeGenModule &CGM) {
4898  if (!CGM.getTarget().hasFeature("ptx"))
4899    return CudaArch::UNKNOWN;
4900  llvm::StringMap<bool> Features;
4901  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4902                                 CGM.getTarget().getTargetOpts().CPU,
4903                                 CGM.getTarget().getTargetOpts().Features);
4904  for (const auto &Feature : Features) {
4905    if (Feature.getValue()) {
4906      CudaArch Arch = StringToCudaArch(Feature.getKey());
4907      if (Arch != CudaArch::UNKNOWN)
4908        return Arch;
4909    }
4910  }
4911  return CudaArch::UNKNOWN;
4912}
4913
4914/// Check whether the target architecture supports unified addressing, which
4915/// is required by the OpenMP "requires" clause unified_shared_memory.
4916void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4917    const OMPRequiresDecl *D) const {
4918  for (const OMPClause *Clause : D->clauselists()) {
4919    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4920      switch (getCudaArch(CGM)) {
4921      case CudaArch::SM_20:
4922      case CudaArch::SM_21:
4923      case CudaArch::SM_30:
4924      case CudaArch::SM_32:
4925      case CudaArch::SM_35:
4926      case CudaArch::SM_37:
4927      case CudaArch::SM_50:
4928      case CudaArch::SM_52:
4929      case CudaArch::SM_53:
4930      case CudaArch::SM_60:
4931      case CudaArch::SM_61:
4932      case CudaArch::SM_62:
4933        CGM.Error(Clause->getBeginLoc(),
4934                  "Target architecture does not support unified addressing");
4935        return;
4936      case CudaArch::SM_70:
4937      case CudaArch::SM_72:
4938      case CudaArch::SM_75:
4939      case CudaArch::GFX600:
4940      case CudaArch::GFX601:
4941      case CudaArch::GFX700:
4942      case CudaArch::GFX701:
4943      case CudaArch::GFX702:
4944      case CudaArch::GFX703:
4945      case CudaArch::GFX704:
4946      case CudaArch::GFX801:
4947      case CudaArch::GFX802:
4948      case CudaArch::GFX803:
4949      case CudaArch::GFX810:
4950      case CudaArch::GFX900:
4951      case CudaArch::GFX902:
4952      case CudaArch::GFX904:
4953      case CudaArch::GFX906:
4954      case CudaArch::GFX909:
4955      case CudaArch::UNKNOWN:
4956        break;
4957      case CudaArch::LAST:
4958        llvm_unreachable("Unexpected Cuda arch.");
4959      }
4960    }
4961  }
4962}
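// For instance, a translation unit containing
// \code
//   #pragma omp requires unified_shared_memory
// \endcode
// compiled for sm_62 or older is rejected with the diagnostic above, while
// sm_70 and newer (and the listed GFX targets) pass the check.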
4963
4964/// Get number of SMs and number of blocks per SM.
4965static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4966  std::pair<unsigned, unsigned> Data;
4967  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4968    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4969  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4970    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4971  if (Data.first && Data.second)
4972    return Data;
4973  switch (getCudaArch(CGM)) {
4974  case CudaArch::SM_20:
4975  case CudaArch::SM_21:
4976  case CudaArch::SM_30:
4977  case CudaArch::SM_32:
4978  case CudaArch::SM_35:
4979  case CudaArch::SM_37:
4980  case CudaArch::SM_50:
4981  case CudaArch::SM_52:
4982  case CudaArch::SM_53:
4983    return {16, 16};
4984  case CudaArch::SM_60:
4985  case CudaArch::SM_61:
4986  case CudaArch::SM_62:
4987    return {56, 32};
4988  case CudaArch::SM_70:
4989  case CudaArch::SM_72:
4990  case CudaArch::SM_75:
4991    return {84, 32};
4992  case CudaArch::GFX600:
4993  case CudaArch::GFX601:
4994  case CudaArch::GFX700:
4995  case CudaArch::GFX701:
4996  case CudaArch::GFX702:
4997  case CudaArch::GFX703:
4998  case CudaArch::GFX704:
4999  case CudaArch::GFX801:
5000  case CudaArch::GFX802:
5001  case CudaArch::GFX803:
5002  case CudaArch::GFX810:
5003  case CudaArch::GFX900:
5004  case CudaArch::GFX902:
5005  case CudaArch::GFX904:
5006  case CudaArch::GFX906:
5007  case CudaArch::GFX909:
5008  case CudaArch::UNKNOWN:
5009    break;
5010  case CudaArch::LAST:
5011    llvm_unreachable("Unexpected Cuda arch.");
5012  }
5013  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5014}
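// These defaults size the per-device globalization buffer: the number of
// slots is NumSMs * BlocksPerSM, e.g. 84 * 32 = 2688 records for sm_70.
// Both values can be overridden through the OpenMPCUDANumSMs and
// OpenMPCUDABlocksPerSM language options checked at the top of the function.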
5015
5016void CGOpenMPRuntimeNVPTX::clear() {
5017  if (!GlobalizedRecords.empty()) {
5018    ASTContext &C = CGM.getContext();
5019    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5020    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5021    RecordDecl *StaticRD = C.buildImplicitRecord(
5022        "_openmp_static_memory_type_$_"RecordDecl::TagKind::TTK_Union);
5023    StaticRD->startDefinition();
5024    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5025        "_shared_openmp_static_memory_type_$_"RecordDecl::TagKind::TTK_Union);
5026    SharedStaticRD->startDefinition();
5027    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5028      if (Records.Records.empty())
5029        continue;
5030      unsigned Size = 0;
5031      unsigned RecAlignment = 0;
5032      for (const RecordDecl *RD : Records.Records) {
5033        QualType RDTy = C.getRecordType(RD);
5034        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5035        RecAlignment = std::max(RecAlignment, Alignment);
5036        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5037        Size =
5038            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5039      }
5040      Size = llvm::alignTo(Size, RecAlignment);
5041      llvm::APInt ArySize(/*numBits=*/64, Size);
5042      QualType SubTy = C.getConstantArrayType(
5043          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5044      const bool UseSharedMemory = Size <= SharedMemorySize;
5045      auto *Field =
5046          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5047                            SourceLocation(), SourceLocation(), nullptr, SubTy,
5048                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5049                            /*BW=*/nullptr, /*Mutable=*/false,
5050                            /*InitStyle=*/ICIS_NoInit);
5051      Field->setAccess(AS_public);
5052      if (UseSharedMemory) {
5053        SharedStaticRD->addDecl(Field);
5054        SharedRecs.push_back(&Records);
5055      } else {
5056        StaticRD->addDecl(Field);
5057        GlobalRecs.push_back(&Records);
5058      }
5059      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5060      Records.UseSharedMemory->setInitializer(
5061          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5062    }
5063    // Allocate a buffer of SharedMemorySize bytes in shared memory.
5064    // FIXME: nvlink does not handle weak linkage correctly (objects with
5065    // different sizes are reported as erroneous).
5066    // Restore this code as soon as nvlink is fixed.
5067    if (!SharedStaticRD->field_empty()) {
5068      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5069      QualType SubTy = C.getConstantArrayType(
5070          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5071      auto *Field = FieldDecl::Create(
5072          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5073          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5074          /*BW=*/nullptr, /*Mutable=*/false,
5075          /*InitStyle=*/ICIS_NoInit);
5076      Field->setAccess(AS_public);
5077      SharedStaticRD->addDecl(Field);
5078    }
5079    SharedStaticRD->completeDefinition();
5080    if (!SharedStaticRD->field_empty()) {
5081      QualType StaticTy = C.getRecordType(SharedStaticRD);
5082      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5083      auto *GV = new llvm::GlobalVariable(
5084          CGM.getModule(), LLVMStaticTy,
5085          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5086          llvm::Constant::getNullValue(LLVMStaticTy),
5087          "_openmp_shared_static_glob_rd_$_"/*InsertBefore=*/nullptr,
5088          llvm::GlobalValue::NotThreadLocal,
5089          C.getTargetAddressSpace(LangAS::cuda_shared));
5090      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5091          GV, CGM.VoidPtrTy);
5092      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5093        Rec->Buffer->replaceAllUsesWith(Replacement);
5094        Rec->Buffer->eraseFromParent();
5095      }
5096    }
5097    StaticRD->completeDefinition();
5098    if (!StaticRD->field_empty()) {
5099      QualType StaticTy = C.getRecordType(StaticRD);
5100      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5101      llvm::APInt Size1(32, SMsBlockPerSM.second);
5102      QualType Arr1Ty =
5103          C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5104                                 /*IndexTypeQuals=*/0);
5105      llvm::APInt Size2(32, SMsBlockPerSM.first);
5106      QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5107                                               /*IndexTypeQuals=*/0);
5108      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5109      // FIXME: nvlink does not handle weak linkage correctly (objects with
5110      // different sizes are reported as erroneous).
5111      // Restore CommonLinkage as soon as nvlink is fixed.
5112      auto *GV = new llvm::GlobalVariable(
5113          CGM.getModule(), LLVMArr2Ty,
5114          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5115          llvm::Constant::getNullValue(LLVMArr2Ty),
5116          "_openmp_static_glob_rd_$_");
5117      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5118          GV, CGM.VoidPtrTy);
5119      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5120        Rec->Buffer->replaceAllUsesWith(Replacement);
5121        Rec->Buffer->eraseFromParent();
5122      }
5123    }
5124  }
5125  if (!TeamsReductions.empty()) {
5126    ASTContext &C = CGM.getContext();
5127    RecordDecl *StaticRD = C.buildImplicitRecord(
5128        "_openmp_teams_reduction_type_$_"RecordDecl::TagKind::TTK_Union);
5129    StaticRD->startDefinition();
5130    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5131      QualType RecTy = C.getRecordType(TeamReductionRec);
5132      auto *Field = FieldDecl::Create(
5133          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5134          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5135          /*BW=*/nullptr, /*Mutable=*/false,
5136          /*InitStyle=*/ICIS_NoInit);
5137      Field->setAccess(AS_public);
5138      StaticRD->addDecl(Field);
5139    }
5140    StaticRD->completeDefinition();
5141    QualType StaticTy = C.getRecordType(StaticRD);
5142    llvm::Type *LLVMReductionsBufferTy =
5143        CGM.getTypes().ConvertTypeForMem(StaticTy);
5144    // FIXME: nvlink does not handle weak linkage correctly (objects with
5145    // different sizes are reported as erroneous).
5146    // Restore CommonLinkage as soon as nvlink is fixed.
5147    auto *GV = new llvm::GlobalVariable(
5148        CGM.getModule(), LLVMReductionsBufferTy,
5149        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5150        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5151        "_openmp_teams_reductions_buffer_$_");
5152    KernelTeamsReductionPtr->setInitializer(
5153        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5154                                                             CGM.VoidPtrTy));
5155  }
5156  CGOpenMPRuntime::clear();
5157}
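// After this finalization the module contains, roughly, globals of the
// following shape (illustrative IR; the union types depend on the records
// collected during codegen):
// \code
//   @"_openmp_static_glob_rd_$_" = internal global
//       [<NumSMs> x [<BlocksPerSM> x %"union._openmp_static_memory_type_$_"]]
//       zeroinitializer
//   @"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global
//       %"union._shared_openmp_static_memory_type_$_" zeroinitializer
// \endcode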
5158