File: src/gnu/usr.bin/clang/libclangCodeGen/../../../llvm/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 1431, column 5: Value stored to 'Size' is never read
1 | //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This provides a generalized class for OpenMP runtime code generation |
10 | // specialized by GPU targets NVPTX and AMDGCN. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "CGOpenMPRuntimeGPU.h" |
15 | #include "CGOpenMPRuntimeNVPTX.h" |
16 | #include "CodeGenFunction.h" |
17 | #include "clang/AST/Attr.h" |
18 | #include "clang/AST/DeclOpenMP.h" |
19 | #include "clang/AST/StmtOpenMP.h" |
20 | #include "clang/AST/StmtVisitor.h" |
21 | #include "clang/Basic/Cuda.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
24 | #include "llvm/IR/IntrinsicsNVPTX.h" |
25 | |
26 | using namespace clang; |
27 | using namespace CodeGen; |
28 | using namespace llvm::omp; |
29 | |
30 | namespace { |
31 | /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
32 | class NVPTXActionTy final : public PrePostActionTy { |
33 | llvm::FunctionCallee EnterCallee = nullptr; |
34 | ArrayRef<llvm::Value *> EnterArgs; |
35 | llvm::FunctionCallee ExitCallee = nullptr; |
36 | ArrayRef<llvm::Value *> ExitArgs; |
37 | bool Conditional = false; |
38 | llvm::BasicBlock *ContBlock = nullptr; |
39 | |
40 | public: |
41 | NVPTXActionTy(llvm::FunctionCallee EnterCallee, |
42 | ArrayRef<llvm::Value *> EnterArgs, |
43 | llvm::FunctionCallee ExitCallee, |
44 | ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) |
45 | : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
46 | ExitArgs(ExitArgs), Conditional(Conditional) {} |
47 | void Enter(CodeGenFunction &CGF) override { |
48 | llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); |
49 | if (Conditional) { |
50 | llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); |
51 | auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); |
52 | ContBlock = CGF.createBasicBlock("omp_if.end"); |
53 | // Generate the branch (If-stmt) |
54 | CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); |
55 | CGF.EmitBlock(ThenBlock); |
56 | } |
57 | } |
58 | void Done(CodeGenFunction &CGF) { |
59 | // Emit the rest of blocks/branches |
60 | CGF.EmitBranch(ContBlock); |
61 | CGF.EmitBlock(ContBlock, true); |
62 | } |
63 | void Exit(CodeGenFunction &CGF) override { |
64 | CGF.EmitRuntimeCall(ExitCallee, ExitArgs); |
65 | } |
66 | }; |
67 | |
68 | /// A class to track the execution mode when codegening directives within |
69 | /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry |
70 | /// to the target region and used by containing directives such as 'parallel' |
71 | /// to emit optimized code. |
72 | class ExecutionRuntimeModesRAII { |
73 | private: |
74 | CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = |
75 | CGOpenMPRuntimeGPU::EM_Unknown; |
76 | CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; |
77 | bool SavedRuntimeMode = false; |
78 | bool *RuntimeMode = nullptr; |
79 | |
80 | public: |
81 | /// Constructor for Non-SPMD mode. |
82 | ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode) |
83 | : ExecMode(ExecMode) { |
84 | SavedExecMode = ExecMode; |
85 | ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD; |
86 | } |
87 | /// Constructor for SPMD mode. |
88 | ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, |
89 | bool &RuntimeMode, bool FullRuntimeMode) |
90 | : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { |
91 | SavedExecMode = ExecMode; |
92 | SavedRuntimeMode = RuntimeMode; |
93 | ExecMode = CGOpenMPRuntimeGPU::EM_SPMD; |
94 | RuntimeMode = FullRuntimeMode; |
95 | } |
96 | ~ExecutionRuntimeModesRAII() { |
97 | ExecMode = SavedExecMode; |
98 | if (RuntimeMode) |
99 | *RuntimeMode = SavedRuntimeMode; |
100 | } |
101 | }; |
102 | |
103 | /// GPU Configuration: This information can be derived from CUDA registers, |
104 | /// however, providing compile time constants helps generate more efficient |
105 | /// code. For all practical purposes this is fine because the configuration |
106 | /// is the same for all known NVPTX architectures. |
107 | enum MachineConfiguration : unsigned { |
108 | /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target |
109 | /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2, |
110 | /// and GV_Warp_Size_Log2_Mask. |
111 | |
112 | /// Global memory alignment for performance. |
113 | GlobalMemoryAlignment = 128, |
114 | |
115 | /// Maximal size of the shared memory buffer. |
116 | SharedMemorySize = 128, |
117 | }; |
118 | |
119 | static const ValueDecl *getPrivateItem(const Expr *RefExpr) { |
120 | RefExpr = RefExpr->IgnoreParens(); |
121 | if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { |
122 | const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); |
123 | while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
124 | Base = TempASE->getBase()->IgnoreParenImpCasts(); |
125 | RefExpr = Base; |
126 | } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) { |
127 | const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); |
128 | while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) |
129 | Base = TempOASE->getBase()->IgnoreParenImpCasts(); |
130 | while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) |
131 | Base = TempASE->getBase()->IgnoreParenImpCasts(); |
132 | RefExpr = Base; |
133 | } |
134 | RefExpr = RefExpr->IgnoreParenImpCasts(); |
135 | if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) |
136 | return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); |
137 | const auto *ME = cast<MemberExpr>(RefExpr); |
138 | return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); |
139 | } |
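// Note (illustration, not in the source): given a list item such as
// 'a[2][3]' or an array section 'a[0:n]', the helper above strips the
// subscript/section layers and returns the canonical declaration of 'a'.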
140 | |
141 | |
142 | static RecordDecl *buildRecordForGlobalizedVars( |
143 | ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, |
144 | ArrayRef<const ValueDecl *> EscapedDeclsForTeams, |
145 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
146 | &MappedDeclsFields, int BufSize) { |
147 | using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; |
148 | if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) |
149 | return nullptr; |
150 | SmallVector<VarsDataTy, 4> GlobalizedVars; |
151 | for (const ValueDecl *D : EscapedDecls) |
152 | GlobalizedVars.emplace_back( |
153 | CharUnits::fromQuantity(std::max( |
154 | C.getDeclAlign(D).getQuantity(), |
155 | static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))), |
156 | D); |
157 | for (const ValueDecl *D : EscapedDeclsForTeams) |
158 | GlobalizedVars.emplace_back(C.getDeclAlign(D), D); |
159 | llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) { |
160 | return L.first > R.first; |
161 | }); |
162 | |
163 | // Build struct _globalized_locals_ty { |
164 | // /* globalized vars */[WarpSize] align (max(decl_align, |
165 | // GlobalMemoryAlignment)) |
166 | // /* globalized vars */ for EscapedDeclsForTeams |
167 | // }; |
168 | RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); |
169 | GlobalizedRD->startDefinition(); |
170 | llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( |
171 | EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); |
172 | for (const auto &Pair : GlobalizedVars) { |
173 | const ValueDecl *VD = Pair.second; |
174 | QualType Type = VD->getType(); |
175 | if (Type->isLValueReferenceType()) |
176 | Type = C.getPointerType(Type.getNonReferenceType()); |
177 | else |
178 | Type = Type.getNonReferenceType(); |
179 | SourceLocation Loc = VD->getLocation(); |
180 | FieldDecl *Field; |
181 | if (SingleEscaped.count(VD)) { |
182 | Field = FieldDecl::Create( |
183 | C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
184 | C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
185 | /*BW=*/nullptr, /*Mutable=*/false, |
186 | /*InitStyle=*/ICIS_NoInit); |
187 | Field->setAccess(AS_public); |
188 | if (VD->hasAttrs()) { |
189 | for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), |
190 | E(VD->getAttrs().end()); |
191 | I != E; ++I) |
192 | Field->addAttr(*I); |
193 | } |
194 | } else { |
195 | llvm::APInt ArraySize(32, BufSize); |
196 | Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal, |
197 | 0); |
198 | Field = FieldDecl::Create( |
199 | C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, |
200 | C.getTrivialTypeSourceInfo(Type, SourceLocation()), |
201 | /*BW=*/nullptr, /*Mutable=*/false, |
202 | /*InitStyle=*/ICIS_NoInit); |
203 | Field->setAccess(AS_public); |
204 | llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(), |
205 | static_cast<CharUnits::QuantityType>( |
206 | GlobalMemoryAlignment))); |
207 | Field->addAttr(AlignedAttr::CreateImplicit( |
208 | C, /*IsAlignmentExpr=*/true, |
209 | IntegerLiteral::Create(C, Align, |
210 | C.getIntTypeForBitwidth(32, /*Signed=*/0), |
211 | SourceLocation()), |
212 | {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned)); |
213 | } |
214 | GlobalizedRD->addDecl(Field); |
215 | MappedDeclsFields.try_emplace(VD, Field); |
216 | } |
217 | GlobalizedRD->completeDefinition(); |
218 | return GlobalizedRD; |
219 | } |
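// Sketch of the record built above for hypothetical escaped variables: an
// 'int x' escaping a parallel region becomes a buffered, over-aligned field,
// while a teams-level 'double y' (EscapedDeclsForTeams) stays scalar:
//
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128))); // BufSize = warp size;
//                                              // align = max(decl align, 128)
//     double y;
//   };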
220 | |
221 | /// Get the list of variables that can escape their declaration context. |
222 | class CheckVarsEscapingDeclContext final |
223 | : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { |
224 | CodeGenFunction &CGF; |
225 | llvm::SetVector<const ValueDecl *> EscapedDecls; |
226 | llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; |
227 | llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; |
228 | RecordDecl *GlobalizedRD = nullptr; |
229 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
230 | bool AllEscaped = false; |
231 | bool IsForCombinedParallelRegion = false; |
232 | |
233 | void markAsEscaped(const ValueDecl *VD) { |
234 | // Do not globalize declare target variables. |
235 | if (!isa<VarDecl>(VD) || |
236 | OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) |
237 | return; |
238 | VD = cast<ValueDecl>(VD->getCanonicalDecl()); |
239 | // Use user-specified allocation. |
240 | if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) |
241 | return; |
242 | // Variables captured by value must be globalized. |
243 | if (auto *CSI = CGF.CapturedStmtInfo) { |
244 | if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { |
245 | // Check if we need to capture the variable that was already captured by |
246 | // value in the outer region. |
247 | if (!IsForCombinedParallelRegion) { |
248 | if (!FD->hasAttrs()) |
249 | return; |
250 | const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); |
251 | if (!Attr) |
252 | return; |
253 | if (((Attr->getCaptureKind() != OMPC_map) && |
254 | !isOpenMPPrivate(Attr->getCaptureKind())) || |
255 | ((Attr->getCaptureKind() == OMPC_map) && |
256 | !FD->getType()->isAnyPointerType())) |
257 | return; |
258 | } |
259 | if (!FD->getType()->isReferenceType()) { |
260 | assert(!VD->getType()->isVariablyModifiedType() && |
261 | "Parameter captured by value with variably modified type"); |
262 | EscapedParameters.insert(VD); |
263 | } else if (!IsForCombinedParallelRegion) { |
264 | return; |
265 | } |
266 | } |
267 | } |
268 | if ((!CGF.CapturedStmtInfo || |
269 | (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && |
270 | VD->getType()->isReferenceType()) |
271 | // Do not globalize variables with reference type. |
272 | return; |
273 | if (VD->getType()->isVariablyModifiedType()) |
274 | EscapedVariableLengthDecls.insert(VD); |
275 | else |
276 | EscapedDecls.insert(VD); |
277 | } |
278 | |
279 | void VisitValueDecl(const ValueDecl *VD) { |
280 | if (VD->getType()->isLValueReferenceType()) |
281 | markAsEscaped(VD); |
282 | if (const auto *VarD = dyn_cast<VarDecl>(VD)) { |
283 | if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { |
284 | const bool SavedAllEscaped = AllEscaped; |
285 | AllEscaped = VD->getType()->isLValueReferenceType(); |
286 | Visit(VarD->getInit()); |
287 | AllEscaped = SavedAllEscaped; |
288 | } |
289 | } |
290 | } |
291 | void VisitOpenMPCapturedStmt(const CapturedStmt *S, |
292 | ArrayRef<OMPClause *> Clauses, |
293 | bool IsCombinedParallelRegion) { |
294 | if (!S) |
295 | return; |
296 | for (const CapturedStmt::Capture &C : S->captures()) { |
297 | if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
298 | const ValueDecl *VD = C.getCapturedVar(); |
299 | bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; |
300 | if (IsCombinedParallelRegion) { |
301 | // Check if the variable is privatized in the combined construct and |
302 | // those private copies must be shared in the inner parallel |
303 | // directive. |
304 | IsForCombinedParallelRegion = false; |
305 | for (const OMPClause *C : Clauses) { |
306 | if (!isOpenMPPrivate(C->getClauseKind()) || |
307 | C->getClauseKind() == OMPC_reduction || |
308 | C->getClauseKind() == OMPC_linear || |
309 | C->getClauseKind() == OMPC_private) |
310 | continue; |
311 | ArrayRef<const Expr *> Vars; |
312 | if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) |
313 | Vars = PC->getVarRefs(); |
314 | else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) |
315 | Vars = PC->getVarRefs(); |
316 | else |
317 | llvm_unreachable("Unexpected clause."); |
318 | for (const auto *E : Vars) { |
319 | const Decl *D = |
320 | cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); |
321 | if (D == VD->getCanonicalDecl()) { |
322 | IsForCombinedParallelRegion = true; |
323 | break; |
324 | } |
325 | } |
326 | if (IsForCombinedParallelRegion) |
327 | break; |
328 | } |
329 | } |
330 | markAsEscaped(VD); |
331 | if (isa<OMPCapturedExprDecl>(VD)) |
332 | VisitValueDecl(VD); |
333 | IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; |
334 | } |
335 | } |
336 | } |
337 | |
338 | void buildRecordForGlobalizedVars(bool IsInTTDRegion) { |
339 | assert(!GlobalizedRD && |
340 | "Record for globalized variables is built already."); |
341 | ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; |
342 | unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); |
343 | if (IsInTTDRegion) |
344 | EscapedDeclsForTeams = EscapedDecls.getArrayRef(); |
345 | else |
346 | EscapedDeclsForParallel = EscapedDecls.getArrayRef(); |
347 | GlobalizedRD = ::buildRecordForGlobalizedVars( |
348 | CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, |
349 | MappedDeclsFields, WarpSize); |
350 | } |
351 | |
352 | public: |
353 | CheckVarsEscapingDeclContext(CodeGenFunction &CGF, |
354 | ArrayRef<const ValueDecl *> TeamsReductions) |
355 | : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { |
356 | } |
357 | virtual ~CheckVarsEscapingDeclContext() = default; |
358 | void VisitDeclStmt(const DeclStmt *S) { |
359 | if (!S) |
360 | return; |
361 | for (const Decl *D : S->decls()) |
362 | if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) |
363 | VisitValueDecl(VD); |
364 | } |
365 | void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { |
366 | if (!D) |
367 | return; |
368 | if (!D->hasAssociatedStmt()) |
369 | return; |
370 | if (const auto *S = |
371 | dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { |
372 | // Do not analyze directives that do not actually require capturing, |
373 | // like `omp for` or `omp simd` directives. |
374 | llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; |
375 | getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); |
376 | if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { |
377 | VisitStmt(S->getCapturedStmt()); |
378 | return; |
379 | } |
380 | VisitOpenMPCapturedStmt( |
381 | S, D->clauses(), |
382 | CaptureRegions.back() == OMPD_parallel && |
383 | isOpenMPDistributeDirective(D->getDirectiveKind())); |
384 | } |
385 | } |
386 | void VisitCapturedStmt(const CapturedStmt *S) { |
387 | if (!S) |
388 | return; |
389 | for (const CapturedStmt::Capture &C : S->captures()) { |
390 | if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
391 | const ValueDecl *VD = C.getCapturedVar(); |
392 | markAsEscaped(VD); |
393 | if (isa<OMPCapturedExprDecl>(VD)) |
394 | VisitValueDecl(VD); |
395 | } |
396 | } |
397 | } |
398 | void VisitLambdaExpr(const LambdaExpr *E) { |
399 | if (!E) |
400 | return; |
401 | for (const LambdaCapture &C : E->captures()) { |
402 | if (C.capturesVariable()) { |
403 | if (C.getCaptureKind() == LCK_ByRef) { |
404 | const ValueDecl *VD = C.getCapturedVar(); |
405 | markAsEscaped(VD); |
406 | if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) |
407 | VisitValueDecl(VD); |
408 | } |
409 | } |
410 | } |
411 | } |
412 | void VisitBlockExpr(const BlockExpr *E) { |
413 | if (!E) |
414 | return; |
415 | for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { |
416 | if (C.isByRef()) { |
417 | const VarDecl *VD = C.getVariable(); |
418 | markAsEscaped(VD); |
419 | if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) |
420 | VisitValueDecl(VD); |
421 | } |
422 | } |
423 | } |
424 | void VisitCallExpr(const CallExpr *E) { |
425 | if (!E) |
426 | return; |
427 | for (const Expr *Arg : E->arguments()) { |
428 | if (!Arg) |
429 | continue; |
430 | if (Arg->isLValue()) { |
431 | const bool SavedAllEscaped = AllEscaped; |
432 | AllEscaped = true; |
433 | Visit(Arg); |
434 | AllEscaped = SavedAllEscaped; |
435 | } else { |
436 | Visit(Arg); |
437 | } |
438 | } |
439 | Visit(E->getCallee()); |
440 | } |
441 | void VisitDeclRefExpr(const DeclRefExpr *E) { |
442 | if (!E) |
443 | return; |
444 | const ValueDecl *VD = E->getDecl(); |
445 | if (AllEscaped) |
446 | markAsEscaped(VD); |
447 | if (isa<OMPCapturedExprDecl>(VD)) |
448 | VisitValueDecl(VD); |
449 | else if (const auto *VarD = dyn_cast<VarDecl>(VD)) |
450 | if (VarD->isInitCapture()) |
451 | VisitValueDecl(VD); |
452 | } |
453 | void VisitUnaryOperator(const UnaryOperator *E) { |
454 | if (!E) |
455 | return; |
456 | if (E->getOpcode() == UO_AddrOf) { |
457 | const bool SavedAllEscaped = AllEscaped; |
458 | AllEscaped = true; |
459 | Visit(E->getSubExpr()); |
460 | AllEscaped = SavedAllEscaped; |
461 | } else { |
462 | Visit(E->getSubExpr()); |
463 | } |
464 | } |
465 | void VisitImplicitCastExpr(const ImplicitCastExpr *E) { |
466 | if (!E) |
467 | return; |
468 | if (E->getCastKind() == CK_ArrayToPointerDecay) { |
469 | const bool SavedAllEscaped = AllEscaped; |
470 | AllEscaped = true; |
471 | Visit(E->getSubExpr()); |
472 | AllEscaped = SavedAllEscaped; |
473 | } else { |
474 | Visit(E->getSubExpr()); |
475 | } |
476 | } |
477 | void VisitExpr(const Expr *E) { |
478 | if (!E) |
479 | return; |
480 | bool SavedAllEscaped = AllEscaped; |
481 | if (!E->isLValue()) |
482 | AllEscaped = false; |
483 | for (const Stmt *Child : E->children()) |
484 | if (Child) |
485 | Visit(Child); |
486 | AllEscaped = SavedAllEscaped; |
487 | } |
488 | void VisitStmt(const Stmt *S) { |
489 | if (!S) |
490 | return; |
491 | for (const Stmt *Child : S->children()) |
492 | if (Child) |
493 | Visit(Child); |
494 | } |
495 | |
496 | /// Returns the record that handles all the escaped local variables and is |
497 | /// used instead of their original storage. |
498 | const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { |
499 | if (!GlobalizedRD) |
500 | buildRecordForGlobalizedVars(IsInTTDRegion); |
501 | return GlobalizedRD; |
502 | } |
503 | |
504 | /// Returns the field in the globalized record for the escaped variable. |
505 | const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { |
506 | assert(GlobalizedRD && |
507 | "Record for globalized variables must be generated already."); |
508 | auto I = MappedDeclsFields.find(VD); |
509 | if (I == MappedDeclsFields.end()) |
510 | return nullptr; |
511 | return I->getSecond(); |
512 | } |
513 | |
514 | /// Returns the list of the escaped local variables/parameters. |
515 | ArrayRef<const ValueDecl *> getEscapedDecls() const { |
516 | return EscapedDecls.getArrayRef(); |
517 | } |
518 | |
519 | /// Returns the set of escaped local variables that are actually parameters |
520 | /// passed by value. |
521 | const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { |
522 | return EscapedParameters; |
523 | } |
524 | |
525 | /// Returns the list of the escaped variables with the variably modified |
526 | /// types. |
527 | ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { |
528 | return EscapedVariableLengthDecls.getArrayRef(); |
529 | } |
530 | }; |
531 | } // anonymous namespace |
532 | |
533 | /// Get the id of the warp in the block. |
534 | /// We assume that the warp size is 32, which is always the case |
535 | /// on the NVPTX device, to generate more efficient code. |
536 | static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { |
537 | CGBuilderTy &Bld = CGF.Builder; |
538 | unsigned LaneIDBits = |
539 | CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); |
540 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
541 | return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); |
542 | } |
543 | |
544 | /// Get the id of the current lane in the Warp. |
545 | /// We assume that the warp size is 32, which is always the case |
546 | /// on the NVPTX device, to generate more efficient code. |
547 | static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { |
548 | CGBuilderTy &Bld = CGF.Builder; |
549 | unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( |
550 | llvm::omp::GV_Warp_Size_Log2_Mask); |
551 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
552 | return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), |
553 | "nvptx_lane_id"); |
554 | } |
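// Worked example (warp size 32, so log2 = 5 and mask = 31): for GPU thread
// id 70, getNVPTXWarpID yields 70 >> 5 = 2 and getNVPTXLaneID yields
// 70 & 31 = 6.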
555 | |
556 | CGOpenMPRuntimeGPU::ExecutionMode |
557 | CGOpenMPRuntimeGPU::getExecutionMode() const { |
558 | return CurrentExecutionMode; |
559 | } |
560 | |
561 | static CGOpenMPRuntimeGPU::DataSharingMode |
562 | getDataSharingMode(CodeGenModule &CGM) { |
563 | return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA |
564 | : CGOpenMPRuntimeGPU::Generic; |
565 | } |
566 | |
567 | /// Check for inner (nested) SPMD construct, if any |
568 | static bool hasNestedSPMDDirective(ASTContext &Ctx, |
569 | const OMPExecutableDirective &D) { |
570 | const auto *CS = D.getInnermostCapturedStmt(); |
571 | const auto *Body = |
572 | CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
573 | const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
574 | |
575 | if (const auto *NestedDir = |
576 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
577 | OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
578 | switch (D.getDirectiveKind()) { |
579 | case OMPD_target: |
580 | if (isOpenMPParallelDirective(DKind)) |
581 | return true; |
582 | if (DKind == OMPD_teams) { |
583 | Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
584 | /*IgnoreCaptured=*/true); |
585 | if (!Body) |
586 | return false; |
587 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
588 | if (const auto *NND = |
589 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
590 | DKind = NND->getDirectiveKind(); |
591 | if (isOpenMPParallelDirective(DKind)) |
592 | return true; |
593 | } |
594 | } |
595 | return false; |
596 | case OMPD_target_teams: |
597 | return isOpenMPParallelDirective(DKind); |
598 | case OMPD_target_simd: |
599 | case OMPD_target_parallel: |
600 | case OMPD_target_parallel_for: |
601 | case OMPD_target_parallel_for_simd: |
602 | case OMPD_target_teams_distribute: |
603 | case OMPD_target_teams_distribute_simd: |
604 | case OMPD_target_teams_distribute_parallel_for: |
605 | case OMPD_target_teams_distribute_parallel_for_simd: |
606 | case OMPD_parallel: |
607 | case OMPD_for: |
608 | case OMPD_parallel_for: |
609 | case OMPD_parallel_master: |
610 | case OMPD_parallel_sections: |
611 | case OMPD_for_simd: |
612 | case OMPD_parallel_for_simd: |
613 | case OMPD_cancel: |
614 | case OMPD_cancellation_point: |
615 | case OMPD_ordered: |
616 | case OMPD_threadprivate: |
617 | case OMPD_allocate: |
618 | case OMPD_task: |
619 | case OMPD_simd: |
620 | case OMPD_sections: |
621 | case OMPD_section: |
622 | case OMPD_single: |
623 | case OMPD_master: |
624 | case OMPD_critical: |
625 | case OMPD_taskyield: |
626 | case OMPD_barrier: |
627 | case OMPD_taskwait: |
628 | case OMPD_taskgroup: |
629 | case OMPD_atomic: |
630 | case OMPD_flush: |
631 | case OMPD_depobj: |
632 | case OMPD_scan: |
633 | case OMPD_teams: |
634 | case OMPD_target_data: |
635 | case OMPD_target_exit_data: |
636 | case OMPD_target_enter_data: |
637 | case OMPD_distribute: |
638 | case OMPD_distribute_simd: |
639 | case OMPD_distribute_parallel_for: |
640 | case OMPD_distribute_parallel_for_simd: |
641 | case OMPD_teams_distribute: |
642 | case OMPD_teams_distribute_simd: |
643 | case OMPD_teams_distribute_parallel_for: |
644 | case OMPD_teams_distribute_parallel_for_simd: |
645 | case OMPD_target_update: |
646 | case OMPD_declare_simd: |
647 | case OMPD_declare_variant: |
648 | case OMPD_begin_declare_variant: |
649 | case OMPD_end_declare_variant: |
650 | case OMPD_declare_target: |
651 | case OMPD_end_declare_target: |
652 | case OMPD_declare_reduction: |
653 | case OMPD_declare_mapper: |
654 | case OMPD_taskloop: |
655 | case OMPD_taskloop_simd: |
656 | case OMPD_master_taskloop: |
657 | case OMPD_master_taskloop_simd: |
658 | case OMPD_parallel_master_taskloop: |
659 | case OMPD_parallel_master_taskloop_simd: |
660 | case OMPD_requires: |
661 | case OMPD_unknown: |
662 | default: |
663 | llvm_unreachable("Unexpected directive."); |
664 | } |
665 | } |
666 | |
667 | return false; |
668 | } |
669 | |
670 | static bool supportsSPMDExecutionMode(ASTContext &Ctx, |
671 | const OMPExecutableDirective &D) { |
672 | OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
673 | switch (DirectiveKind) { |
674 | case OMPD_target: |
675 | case OMPD_target_teams: |
676 | return hasNestedSPMDDirective(Ctx, D); |
677 | case OMPD_target_parallel: |
678 | case OMPD_target_parallel_for: |
679 | case OMPD_target_parallel_for_simd: |
680 | case OMPD_target_teams_distribute_parallel_for: |
681 | case OMPD_target_teams_distribute_parallel_for_simd: |
682 | case OMPD_target_simd: |
683 | case OMPD_target_teams_distribute_simd: |
684 | return true; |
685 | case OMPD_target_teams_distribute: |
686 | return false; |
687 | case OMPD_parallel: |
688 | case OMPD_for: |
689 | case OMPD_parallel_for: |
690 | case OMPD_parallel_master: |
691 | case OMPD_parallel_sections: |
692 | case OMPD_for_simd: |
693 | case OMPD_parallel_for_simd: |
694 | case OMPD_cancel: |
695 | case OMPD_cancellation_point: |
696 | case OMPD_ordered: |
697 | case OMPD_threadprivate: |
698 | case OMPD_allocate: |
699 | case OMPD_task: |
700 | case OMPD_simd: |
701 | case OMPD_sections: |
702 | case OMPD_section: |
703 | case OMPD_single: |
704 | case OMPD_master: |
705 | case OMPD_critical: |
706 | case OMPD_taskyield: |
707 | case OMPD_barrier: |
708 | case OMPD_taskwait: |
709 | case OMPD_taskgroup: |
710 | case OMPD_atomic: |
711 | case OMPD_flush: |
712 | case OMPD_depobj: |
713 | case OMPD_scan: |
714 | case OMPD_teams: |
715 | case OMPD_target_data: |
716 | case OMPD_target_exit_data: |
717 | case OMPD_target_enter_data: |
718 | case OMPD_distribute: |
719 | case OMPD_distribute_simd: |
720 | case OMPD_distribute_parallel_for: |
721 | case OMPD_distribute_parallel_for_simd: |
722 | case OMPD_teams_distribute: |
723 | case OMPD_teams_distribute_simd: |
724 | case OMPD_teams_distribute_parallel_for: |
725 | case OMPD_teams_distribute_parallel_for_simd: |
726 | case OMPD_target_update: |
727 | case OMPD_declare_simd: |
728 | case OMPD_declare_variant: |
729 | case OMPD_begin_declare_variant: |
730 | case OMPD_end_declare_variant: |
731 | case OMPD_declare_target: |
732 | case OMPD_end_declare_target: |
733 | case OMPD_declare_reduction: |
734 | case OMPD_declare_mapper: |
735 | case OMPD_taskloop: |
736 | case OMPD_taskloop_simd: |
737 | case OMPD_master_taskloop: |
738 | case OMPD_master_taskloop_simd: |
739 | case OMPD_parallel_master_taskloop: |
740 | case OMPD_parallel_master_taskloop_simd: |
741 | case OMPD_requires: |
742 | case OMPD_unknown: |
743 | default: |
744 | break; |
745 | } |
746 | llvm_unreachable( |
747 | "Unknown programming model for OpenMP directive on NVPTX target."); |
748 | } |
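// Illustration (hypothetical user code): a '#pragma omp target' whose body
// is a single '#pragma omp parallel for' region is classified as SPMD via
// hasNestedSPMDDirective, while a bare '#pragma omp target' around
// sequential code falls back to generic (non-SPMD) execution.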
749 | |
750 | /// Check if the directive is loop-based and either has no schedule clause |
751 | /// at all or has static scheduling. |
752 | static bool hasStaticScheduling(const OMPExecutableDirective &D) { |
753 | assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) && |
754 | isOpenMPLoopDirective(D.getDirectiveKind()) && |
755 | "Expected loop-based directive."); |
756 | return !D.hasClausesOfKind<OMPOrderedClause>() && |
757 | (!D.hasClausesOfKind<OMPScheduleClause>() || |
758 | llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(), |
759 | [](const OMPScheduleClause *C) { |
760 | return C->getScheduleKind() == OMPC_SCHEDULE_static; |
761 | })); |
762 | } |
763 | |
764 | /// Check for inner (nested) lightweight runtime construct, if any |
765 | static bool hasNestedLightweightDirective(ASTContext &Ctx, |
766 | const OMPExecutableDirective &D) { |
767 | assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive."); |
768 | const auto *CS = D.getInnermostCapturedStmt(); |
769 | const auto *Body = |
770 | CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
771 | const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
772 | |
773 | if (const auto *NestedDir = |
774 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
775 | OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
776 | switch (D.getDirectiveKind()) { |
777 | case OMPD_target: |
778 | if (isOpenMPParallelDirective(DKind) && |
779 | isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && |
780 | hasStaticScheduling(*NestedDir)) |
781 | return true; |
782 | if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd) |
783 | return true; |
784 | if (DKind == OMPD_parallel) { |
785 | Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
786 | /*IgnoreCaptured=*/true); |
787 | if (!Body) |
788 | return false; |
789 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
790 | if (const auto *NND = |
791 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
792 | DKind = NND->getDirectiveKind(); |
793 | if (isOpenMPWorksharingDirective(DKind) && |
794 | isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
795 | return true; |
796 | } |
797 | } else if (DKind == OMPD_teams) { |
798 | Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
799 | /*IgnoreCaptured=*/true); |
800 | if (!Body) |
801 | return false; |
802 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
803 | if (const auto *NND = |
804 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
805 | DKind = NND->getDirectiveKind(); |
806 | if (isOpenMPParallelDirective(DKind) && |
807 | isOpenMPWorksharingDirective(DKind) && |
808 | isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
809 | return true; |
810 | if (DKind == OMPD_parallel) { |
811 | Body = NND->getInnermostCapturedStmt()->IgnoreContainers( |
812 | /*IgnoreCaptured=*/true); |
813 | if (!Body) |
814 | return false; |
815 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
816 | if (const auto *NND = |
817 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
818 | DKind = NND->getDirectiveKind(); |
819 | if (isOpenMPWorksharingDirective(DKind) && |
820 | isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
821 | return true; |
822 | } |
823 | } |
824 | } |
825 | } |
826 | return false; |
827 | case OMPD_target_teams: |
828 | if (isOpenMPParallelDirective(DKind) && |
829 | isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) && |
830 | hasStaticScheduling(*NestedDir)) |
831 | return true; |
832 | if (DKind == OMPD_distribute_simd || DKind == OMPD_simd) |
833 | return true; |
834 | if (DKind == OMPD_parallel) { |
835 | Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
836 | /*IgnoreCaptured=*/true); |
837 | if (!Body) |
838 | return false; |
839 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
840 | if (const auto *NND = |
841 | dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { |
842 | DKind = NND->getDirectiveKind(); |
843 | if (isOpenMPWorksharingDirective(DKind) && |
844 | isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND)) |
845 | return true; |
846 | } |
847 | } |
848 | return false; |
849 | case OMPD_target_parallel: |
850 | if (DKind == OMPD_simd) |
851 | return true; |
852 | return isOpenMPWorksharingDirective(DKind) && |
853 | isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir); |
854 | case OMPD_target_teams_distribute: |
855 | case OMPD_target_simd: |
856 | case OMPD_target_parallel_for: |
857 | case OMPD_target_parallel_for_simd: |
858 | case OMPD_target_teams_distribute_simd: |
859 | case OMPD_target_teams_distribute_parallel_for: |
860 | case OMPD_target_teams_distribute_parallel_for_simd: |
861 | case OMPD_parallel: |
862 | case OMPD_for: |
863 | case OMPD_parallel_for: |
864 | case OMPD_parallel_master: |
865 | case OMPD_parallel_sections: |
866 | case OMPD_for_simd: |
867 | case OMPD_parallel_for_simd: |
868 | case OMPD_cancel: |
869 | case OMPD_cancellation_point: |
870 | case OMPD_ordered: |
871 | case OMPD_threadprivate: |
872 | case OMPD_allocate: |
873 | case OMPD_task: |
874 | case OMPD_simd: |
875 | case OMPD_sections: |
876 | case OMPD_section: |
877 | case OMPD_single: |
878 | case OMPD_master: |
879 | case OMPD_critical: |
880 | case OMPD_taskyield: |
881 | case OMPD_barrier: |
882 | case OMPD_taskwait: |
883 | case OMPD_taskgroup: |
884 | case OMPD_atomic: |
885 | case OMPD_flush: |
886 | case OMPD_depobj: |
887 | case OMPD_scan: |
888 | case OMPD_teams: |
889 | case OMPD_target_data: |
890 | case OMPD_target_exit_data: |
891 | case OMPD_target_enter_data: |
892 | case OMPD_distribute: |
893 | case OMPD_distribute_simd: |
894 | case OMPD_distribute_parallel_for: |
895 | case OMPD_distribute_parallel_for_simd: |
896 | case OMPD_teams_distribute: |
897 | case OMPD_teams_distribute_simd: |
898 | case OMPD_teams_distribute_parallel_for: |
899 | case OMPD_teams_distribute_parallel_for_simd: |
900 | case OMPD_target_update: |
901 | case OMPD_declare_simd: |
902 | case OMPD_declare_variant: |
903 | case OMPD_begin_declare_variant: |
904 | case OMPD_end_declare_variant: |
905 | case OMPD_declare_target: |
906 | case OMPD_end_declare_target: |
907 | case OMPD_declare_reduction: |
908 | case OMPD_declare_mapper: |
909 | case OMPD_taskloop: |
910 | case OMPD_taskloop_simd: |
911 | case OMPD_master_taskloop: |
912 | case OMPD_master_taskloop_simd: |
913 | case OMPD_parallel_master_taskloop: |
914 | case OMPD_parallel_master_taskloop_simd: |
915 | case OMPD_requires: |
916 | case OMPD_unknown: |
917 | default: |
918 | llvm_unreachable("Unexpected directive."); |
919 | } |
920 | } |
921 | |
922 | return false; |
923 | } |
924 | |
925 | /// Checks if the construct supports lightweight runtime. It must be SPMD |
926 | /// construct + inner loop-based construct with static scheduling. |
927 | static bool supportsLightweightRuntime(ASTContext &Ctx, |
928 | const OMPExecutableDirective &D) { |
929 | if (!supportsSPMDExecutionMode(Ctx, D)) |
930 | return false; |
931 | OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
932 | switch (DirectiveKind) { |
933 | case OMPD_target: |
934 | case OMPD_target_teams: |
935 | case OMPD_target_parallel: |
936 | return hasNestedLightweightDirective(Ctx, D); |
937 | case OMPD_target_parallel_for: |
938 | case OMPD_target_parallel_for_simd: |
939 | case OMPD_target_teams_distribute_parallel_for: |
940 | case OMPD_target_teams_distribute_parallel_for_simd: |
941 | // (Last|First)-privates must be shared in parallel region. |
942 | return hasStaticScheduling(D); |
943 | case OMPD_target_simd: |
944 | case OMPD_target_teams_distribute_simd: |
945 | return true; |
946 | case OMPD_target_teams_distribute: |
947 | return false; |
948 | case OMPD_parallel: |
949 | case OMPD_for: |
950 | case OMPD_parallel_for: |
951 | case OMPD_parallel_master: |
952 | case OMPD_parallel_sections: |
953 | case OMPD_for_simd: |
954 | case OMPD_parallel_for_simd: |
955 | case OMPD_cancel: |
956 | case OMPD_cancellation_point: |
957 | case OMPD_ordered: |
958 | case OMPD_threadprivate: |
959 | case OMPD_allocate: |
960 | case OMPD_task: |
961 | case OMPD_simd: |
962 | case OMPD_sections: |
963 | case OMPD_section: |
964 | case OMPD_single: |
965 | case OMPD_master: |
966 | case OMPD_critical: |
967 | case OMPD_taskyield: |
968 | case OMPD_barrier: |
969 | case OMPD_taskwait: |
970 | case OMPD_taskgroup: |
971 | case OMPD_atomic: |
972 | case OMPD_flush: |
973 | case OMPD_depobj: |
974 | case OMPD_scan: |
975 | case OMPD_teams: |
976 | case OMPD_target_data: |
977 | case OMPD_target_exit_data: |
978 | case OMPD_target_enter_data: |
979 | case OMPD_distribute: |
980 | case OMPD_distribute_simd: |
981 | case OMPD_distribute_parallel_for: |
982 | case OMPD_distribute_parallel_for_simd: |
983 | case OMPD_teams_distribute: |
984 | case OMPD_teams_distribute_simd: |
985 | case OMPD_teams_distribute_parallel_for: |
986 | case OMPD_teams_distribute_parallel_for_simd: |
987 | case OMPD_target_update: |
988 | case OMPD_declare_simd: |
989 | case OMPD_declare_variant: |
990 | case OMPD_begin_declare_variant: |
991 | case OMPD_end_declare_variant: |
992 | case OMPD_declare_target: |
993 | case OMPD_end_declare_target: |
994 | case OMPD_declare_reduction: |
995 | case OMPD_declare_mapper: |
996 | case OMPD_taskloop: |
997 | case OMPD_taskloop_simd: |
998 | case OMPD_master_taskloop: |
999 | case OMPD_master_taskloop_simd: |
1000 | case OMPD_parallel_master_taskloop: |
1001 | case OMPD_parallel_master_taskloop_simd: |
1002 | case OMPD_requires: |
1003 | case OMPD_unknown: |
1004 | default: |
1005 | break; |
1006 | } |
1007 | llvm_unreachable( |
1008 | "Unknown programming model for OpenMP directive on NVPTX target."); |
1009 | } |
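// Illustration (hypothetical user code): a directive such as
//
//   #pragma omp target teams distribute parallel for schedule(static)
//   for (int i = 0; i < n; ++i) ...
//
// satisfies hasStaticScheduling(D), so the lightweight runtime is chosen;
// adding schedule(dynamic) or an 'ordered' clause forces the full runtime.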
1010 | |
1011 | void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, |
1012 | StringRef ParentName, |
1013 | llvm::Function *&OutlinedFn, |
1014 | llvm::Constant *&OutlinedFnID, |
1015 | bool IsOffloadEntry, |
1016 | const RegionCodeGenTy &CodeGen) { |
1017 | ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); |
1018 | EntryFunctionState EST; |
1019 | WrapperFunctionsMap.clear(); |
1020 | |
1021 | // Emit target region as a standalone region. |
1022 | class NVPTXPrePostActionTy : public PrePostActionTy { |
1023 | CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
1024 | |
1025 | public: |
1026 | NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST) |
1027 | : EST(EST) {} |
1028 | void Enter(CodeGenFunction &CGF) override { |
1029 | auto &RT = |
1030 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1031 | RT.emitKernelInit(CGF, EST, /* IsSPMD */ false); |
1032 | // Skip target region initialization. |
1033 | RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
1034 | } |
1035 | void Exit(CodeGenFunction &CGF) override { |
1036 | auto &RT = |
1037 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1038 | RT.clearLocThreadIdInsertPt(CGF); |
1039 | RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); |
1040 | } |
1041 | } Action(EST); |
1042 | CodeGen.setAction(Action); |
1043 | IsInTTDRegion = true; |
1044 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
1045 | IsOffloadEntry, CodeGen); |
1046 | IsInTTDRegion = false; |
1047 | } |
1048 | |
1049 | void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF, |
1050 | EntryFunctionState &EST, bool IsSPMD) { |
1051 | CGBuilderTy &Bld = CGF.Builder; |
1052 | Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime())); |
1053 | IsInTargetMasterThreadRegion = IsSPMD; |
1054 | if (!IsSPMD) |
1055 | emitGenericVarsProlog(CGF, EST.Loc); |
1056 | } |
1057 | |
1058 | void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, |
1059 | EntryFunctionState &EST, |
1060 | bool IsSPMD) { |
1061 | if (!IsSPMD) |
1062 | emitGenericVarsEpilog(CGF); |
1063 | |
1064 | CGBuilderTy &Bld = CGF.Builder; |
1065 | OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime()); |
1066 | } |
1067 | |
1068 | void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, |
1069 | StringRef ParentName, |
1070 | llvm::Function *&OutlinedFn, |
1071 | llvm::Constant *&OutlinedFnID, |
1072 | bool IsOffloadEntry, |
1073 | const RegionCodeGenTy &CodeGen) { |
1074 | ExecutionRuntimeModesRAII ModeRAII( |
1075 | CurrentExecutionMode, RequiresFullRuntime, |
1076 | CGM.getLangOpts().OpenMPCUDAForceFullRuntime || |
1077 | !supportsLightweightRuntime(CGM.getContext(), D)); |
1078 | EntryFunctionState EST; |
1079 | |
1080 | // Emit target region as a standalone region. |
1081 | class NVPTXPrePostActionTy : public PrePostActionTy { |
1082 | CGOpenMPRuntimeGPU &RT; |
1083 | CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
1084 | |
1085 | public: |
1086 | NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, |
1087 | CGOpenMPRuntimeGPU::EntryFunctionState &EST) |
1088 | : RT(RT), EST(EST) {} |
1089 | void Enter(CodeGenFunction &CGF) override { |
1090 | RT.emitKernelInit(CGF, EST, /* IsSPMD */ true); |
1091 | // Skip target region initialization. |
1092 | RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
1093 | } |
1094 | void Exit(CodeGenFunction &CGF) override { |
1095 | RT.clearLocThreadIdInsertPt(CGF); |
1096 | RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); |
1097 | } |
1098 | } Action(*this, EST); |
1099 | CodeGen.setAction(Action); |
1100 | IsInTTDRegion = true; |
1101 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
1102 | IsOffloadEntry, CodeGen); |
1103 | IsInTTDRegion = false; |
1104 | } |
1105 | |
1106 | // Create a unique global variable to indicate the execution mode of this target |
1107 | // region. The execution mode is either 'generic', or 'spmd' depending on the |
1108 | // target directive. This variable is picked up by the offload library to setup |
1109 | // the device appropriately before kernel launch. If the execution mode is |
1110 | // 'generic', the runtime reserves one warp for the master, otherwise, all |
1111 | // warps participate in parallel work. |
1112 | static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, |
1113 | bool Mode) { |
1114 | auto *GVMode = |
1115 | new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, |
1116 | llvm::GlobalValue::WeakAnyLinkage, |
1117 | llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), |
1118 | Twine(Name, "_exec_mode")); |
1119 | CGM.addCompilerUsedGlobal(GVMode); |
1120 | } |
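// Sketch of the emitted marker (kernel name hypothetical): for a target
// region compiled in SPMD mode this produces, roughly,
//
//   @__omp_offloading_<id>_foo_l1_exec_mode = weak constant i8 0
//
// (i8 1 for generic mode), and addCompilerUsedGlobal keeps the global alive
// in llvm.compiler.used so the offload library can read it before launch.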
1121 | |
1122 | void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID, |
1123 | llvm::Constant *Addr, |
1124 | uint64_t Size, int32_t, |
1125 | llvm::GlobalValue::LinkageTypes) { |
1126 | // TODO: Add support for global variables on the device after declare target |
1127 | // support. |
1128 | if (!isa<llvm::Function>(Addr)) |
1129 | return; |
1130 | llvm::Module &M = CGM.getModule(); |
1131 | llvm::LLVMContext &Ctx = CGM.getLLVMContext(); |
1132 | |
1133 | // Get "nvvm.annotations" metadata node |
1134 | llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); |
1135 | |
1136 | llvm::Metadata *MDVals[] = { |
1137 | llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"), |
1138 | llvm::ConstantAsMetadata::get( |
1139 | llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; |
1140 | // Append metadata to nvvm.annotations |
1141 | MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); |
1142 | } |
1143 | |
1144 | void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( |
1145 | const OMPExecutableDirective &D, StringRef ParentName, |
1146 | llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
1147 | bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
1148 | if (!IsOffloadEntry) // Nothing to do. |
1149 | return; |
1150 | |
1151 | assert(!ParentName.empty() && "Invalid target region parent name!"); |
1152 | |
1153 | bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); |
1154 | if (Mode) |
1155 | emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
1156 | CodeGen); |
1157 | else |
1158 | emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
1159 | CodeGen); |
1160 | |
1161 | setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); |
1162 | } |
1163 | |
1164 | namespace { |
1165 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); |
1166 | /// Enum for accessing the reserved_2 field of the ident_t struct. |
1167 | enum ModeFlagsTy : unsigned { |
1168 | /// Bit set to 1 when in SPMD mode. |
1169 | KMP_IDENT_SPMD_MODE = 0x01, |
1170 | /// Bit set to 1 when a simplified runtime is used. |
1171 | KMP_IDENT_SIMPLE_RT_MODE = 0x02, |
1172 | LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE) |
1173 | }; |
1174 | |
1175 | /// Special Undefined mode: the combination of non-SPMD mode + the simple runtime. |
1176 | static const ModeFlagsTy UndefinedMode = |
1177 | (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE; |
1178 | } // anonymous namespace |
1179 | |
1180 | unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const { |
1181 | switch (getExecutionMode()) { |
1182 | case EM_SPMD: |
1183 | if (requiresFullRuntime()) |
1184 | return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE); |
1185 | return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE; |
1186 | case EM_NonSPMD: |
1187 | assert(requiresFullRuntime() && "Expected full runtime."); |
1188 | return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE); |
1189 | case EM_Unknown: |
1190 | return UndefinedMode; |
1191 | } |
1192 | llvm_unreachable("Unknown flags are requested."); |
1193 | } |
1194 | |
1195 | CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) |
1196 | : CGOpenMPRuntime(CGM, "_", "$") { |
1197 | if (!CGM.getLangOpts().OpenMPIsDevice) |
1198 | llvm_unreachable("OpenMP NVPTX can only handle device code."); |
1199 | } |
1200 | |
1201 | void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, |
1202 | ProcBindKind ProcBind, |
1203 | SourceLocation Loc) { |
1204 | // Do nothing in case of SPMD mode and L0 parallel. |
1205 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
1206 | return; |
1207 | |
1208 | CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); |
1209 | } |
1210 | |
1211 | void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, |
1212 | llvm::Value *NumThreads, |
1213 | SourceLocation Loc) { |
1214 | // Do nothing in case of SPMD mode and L0 parallel. |
1215 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
1216 | return; |
1217 | |
1218 | CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); |
1219 | } |
1220 | |
1221 | void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, |
1222 | const Expr *NumTeams, |
1223 | const Expr *ThreadLimit, |
1224 | SourceLocation Loc) {} |
1225 | |
1226 | llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( |
1227 | const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
1228 | OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
1229 | // Emit target region as a standalone region. |
1230 | class NVPTXPrePostActionTy : public PrePostActionTy { |
1231 | bool &IsInParallelRegion; |
1232 | bool PrevIsInParallelRegion; |
1233 | |
1234 | public: |
1235 | NVPTXPrePostActionTy(bool &IsInParallelRegion) |
1236 | : IsInParallelRegion(IsInParallelRegion) {} |
1237 | void Enter(CodeGenFunction &CGF) override { |
1238 | PrevIsInParallelRegion = IsInParallelRegion; |
1239 | IsInParallelRegion = true; |
1240 | } |
1241 | void Exit(CodeGenFunction &CGF) override { |
1242 | IsInParallelRegion = PrevIsInParallelRegion; |
1243 | } |
1244 | } Action(IsInParallelRegion); |
1245 | CodeGen.setAction(Action); |
1246 | bool PrevIsInTTDRegion = IsInTTDRegion; |
1247 | IsInTTDRegion = false; |
1248 | bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion; |
1249 | IsInTargetMasterThreadRegion = false; |
1250 | auto *OutlinedFun = |
1251 | cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( |
1252 | D, ThreadIDVar, InnermostKind, CodeGen)); |
1253 | IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion; |
1254 | IsInTTDRegion = PrevIsInTTDRegion; |
1255 | if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD && |
1256 | !IsInParallelRegion) { |
1257 | llvm::Function *WrapperFun = |
1258 | createParallelDataSharingWrapper(OutlinedFun, D); |
1259 | WrapperFunctionsMap[OutlinedFun] = WrapperFun; |
1260 | } |
1261 | |
1262 | return OutlinedFun; |
1263 | } |
1264 | |
1265 | /// Get list of lastprivate variables from the teams distribute ... or |
1266 | /// teams {distribute ...} directives. |
1267 | static void |
1268 | getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
1269 | llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
1270 | assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
1271 | "expected teams directive."); |
1272 | const OMPExecutableDirective *Dir = &D; |
1273 | if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { |
1274 | if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( |
1275 | Ctx, |
1276 | D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( |
1277 | /*IgnoreCaptured=*/true))) { |
1278 | Dir = dyn_cast_or_null<OMPExecutableDirective>(S); |
1279 | if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) |
1280 | Dir = nullptr; |
1281 | } |
1282 | } |
1283 | if (!Dir) |
1284 | return; |
1285 | for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { |
1286 | for (const Expr *E : C->getVarRefs()) |
1287 | Vars.push_back(getPrivateItem(E)); |
1288 | } |
1289 | } |
1290 | |
1291 | /// Get list of reduction variables from the teams ... directives. |
1292 | static void |
1293 | getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
1294 | llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
1295 | assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
1296 | "expected teams directive."); |
1297 | for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { |
1298 | for (const Expr *E : C->privates()) |
1299 | Vars.push_back(getPrivateItem(E)); |
1300 | } |
1301 | } |
1302 | |
1303 | llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( |
1304 | const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
1305 | OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
1306 | SourceLocation Loc = D.getBeginLoc(); |
1307 | |
1308 | const RecordDecl *GlobalizedRD = nullptr; |
1309 | llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; |
1310 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
1311 | unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); |
1312 | // Globalize team reductions variable unconditionally in all modes. |
1313 | if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
1314 | getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); |
1315 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
1316 | getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); |
1317 | if (!LastPrivatesReductions.empty()) { |
1318 | GlobalizedRD = ::buildRecordForGlobalizedVars( |
1319 | CGM.getContext(), llvm::None, LastPrivatesReductions, |
1320 | MappedDeclsFields, WarpSize); |
1321 | } |
1322 | } else if (!LastPrivatesReductions.empty()) { |
1323 | assert(!TeamAndReductions.first && |
1324 | "Previous team declaration is not expected."); |
1325 | TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); |
1326 | std::swap(TeamAndReductions.second, LastPrivatesReductions); |
1327 | } |
1328 | |
1329 | // Emit target region as a standalone region. |
1330 | class NVPTXPrePostActionTy : public PrePostActionTy { |
1331 | SourceLocation &Loc; |
1332 | const RecordDecl *GlobalizedRD; |
1333 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
1334 | &MappedDeclsFields; |
1335 | |
1336 | public: |
1337 | NVPTXPrePostActionTy( |
1338 | SourceLocation &Loc, const RecordDecl *GlobalizedRD, |
1339 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
1340 | &MappedDeclsFields) |
1341 | : Loc(Loc), GlobalizedRD(GlobalizedRD), |
1342 | MappedDeclsFields(MappedDeclsFields) {} |
1343 | void Enter(CodeGenFunction &CGF) override { |
1344 | auto &Rt = |
1345 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1346 | if (GlobalizedRD) { |
1347 | auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; |
1348 | I->getSecond().MappedParams = |
1349 | std::make_unique<CodeGenFunction::OMPMapVars>(); |
1350 | DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
1351 | for (const auto &Pair : MappedDeclsFields) { |
1352 | assert(Pair.getFirst()->isCanonicalDecl() && |
1353 | "Expected canonical declaration"); |
1354 | Data.insert(std::make_pair(Pair.getFirst(), MappedVarData())); |
1355 | } |
1356 | } |
1357 | Rt.emitGenericVarsProlog(CGF, Loc); |
1358 | } |
1359 | void Exit(CodeGenFunction &CGF) override { |
1360 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
1361 | .emitGenericVarsEpilog(CGF); |
1362 | } |
1363 | } Action(Loc, GlobalizedRD, MappedDeclsFields); |
1364 | CodeGen.setAction(Action); |
1365 | llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( |
1366 | D, ThreadIDVar, InnermostKind, CodeGen); |
1367 | |
1368 | return OutlinedFun; |
1369 | } |
1370 | |
1371 | void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, |
1372 | SourceLocation Loc, |
1373 | bool WithSPMDCheck) { |
1374 | if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && |
1375 | getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
1376 | return; |
1377 | |
1378 | CGBuilderTy &Bld = CGF.Builder; |
1379 | |
1380 | const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
1381 | if (I == FunctionGlobalizedDecls.end()) |
1382 | return; |
1383 | |
1384 | for (auto &Rec : I->getSecond().LocalVarData) { |
1385 | const auto *VD = cast<VarDecl>(Rec.first); |
1386 | bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); |
1387 | QualType VarTy = VD->getType(); |
1388 | |
1389 | // Get the local allocation of a firstprivate variable before sharing |
1390 | llvm::Value *ParValue; |
1391 | if (EscapedParam) { |
1392 | LValue ParLVal = |
1393 | CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); |
1394 | ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); |
1395 | } |
1396 | |
1397 | // Allocate space for the variable to be globalized |
1398 | llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; |
1399 | llvm::Instruction *VoidPtr = |
1400 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1401 | CGM.getModule(), OMPRTL___kmpc_alloc_shared), |
1402 | AllocArgs, VD->getName()); |
1403 | |
1404 | // Cast the void pointer and get the address of the globalized variable. |
1405 | llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); |
1406 | llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
1407 | VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); |
1408 | LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy); |
1409 | Rec.second.PrivateAddr = VarAddr.getAddress(CGF); |
1410 | Rec.second.GlobalizedVal = VoidPtr; |
1411 | |
1412 | // Assign the local allocation to the newly globalized location. |
1413 | if (EscapedParam) { |
1414 | CGF.EmitStoreOfScalar(ParValue, VarAddr); |
1415 | I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF)); |
1416 | } |
1417 | if (auto *DI = CGF.getDebugInfo()) |
1418 | VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation())); |
1419 | } |
1420 | for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) { |
1421 | // Use actual memory size of the VLA object including the padding |
1422 | // for alignment purposes. |
1423 | llvm::Value *Size = CGF.getTypeSize(VD->getType()); |
1424 | CharUnits Align = CGM.getContext().getDeclAlign(VD); |
1425 | Size = Bld.CreateNUWAdd( |
1426 | Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); |
1427 | llvm::Value *AlignVal = |
1428 | llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); |
1429 | |
1430 | Size = Bld.CreateUDiv(Size, AlignVal); |
1431 | Size = Bld.CreateNUWMul(Size, AlignVal); |
Value stored to 'Size' is never read | |
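// Note on the flagged dead store: the aligned Size computed above is never
// consumed -- the __kmpc_alloc_shared call below passes the raw type size
// again, so the padding computed for alignment is dropped. A plausible fix
// (an assumption here, in line with later upstream revisions) would be:
//   llvm::Value *AllocArgs[] = {Size};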
1432 | |
1433 | // Allocate space for this VLA object to be globalized. |
1434 | llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; |
1435 | llvm::Instruction *VoidPtr = |
1436 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1437 | CGM.getModule(), OMPRTL___kmpc_alloc_shared), |
1438 | AllocArgs, VD->getName()); |
1439 | |
1440 | I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back( |
1441 | std::pair<llvm::Value *, llvm::Value *>( |
1442 | {VoidPtr, CGF.getTypeSize(VD->getType())})); |
1443 | LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(), |
1444 | CGM.getContext().getDeclAlign(VD), |
1445 | AlignmentSource::Decl); |
1446 | I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD), |
1447 | Base.getAddress(CGF)); |
1448 | } |
1449 | I->getSecond().MappedParams->apply(CGF); |
1450 | } |
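// A standalone sketch (illustrative only, not part of this file) of the
// rounding arithmetic in the VLA loop above: the byte size is bumped to the
// next multiple of the declared alignment via the add/udiv/mul sequence.
#include <cstddef>
constexpr std::size_t alignUp(std::size_t Size, std::size_t Align) {
  return (Size + Align - 1) / Align * Align;
}
static_assert(alignUp(10, 8) == 16, "10 bytes at align 8 round up to 16");
static_assert(alignUp(16, 8) == 16, "already-aligned sizes are unchanged");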
1451 | |
1452 | void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF, |
1453 | bool WithSPMDCheck) { |
1454 | if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic && |
1455 | getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
1456 | return; |
1457 | |
1458 | const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
1459 | if (I != FunctionGlobalizedDecls.end()) { |
1460 | // Deallocate the memory for each globalized VLA object |
1461 | for (auto AddrSizePair : |
1462 | llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { |
1463 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1464 | CGM.getModule(), OMPRTL___kmpc_free_shared), |
1465 | {AddrSizePair.first, AddrSizePair.second}); |
1466 | } |
1467 | // Deallocate the memory for each globalized value |
1468 | for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { |
1469 | const auto *VD = cast<VarDecl>(Rec.first); |
1470 | I->getSecond().MappedParams->restore(CGF); |
1471 | |
1472 | llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, |
1473 | CGF.getTypeSize(VD->getType())}; |
1474 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1475 | CGM.getModule(), OMPRTL___kmpc_free_shared), |
1476 | FreeArgs); |
1477 | } |
1478 | } |
1479 | } |
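// Conceptually, the prolog/epilog pair brackets the function body like the
// following pseudo-code (a sketch assuming the device runtime's stack-like
// discipline for shared-memory allocations; frees happen in reverse order):
//   void *a = __kmpc_alloc_shared(sizeof(x));   // prolog
//   void *b = __kmpc_alloc_shared(vla_bytes);   // prolog (VLA)
//   ...                                         // body uses a and b
//   __kmpc_free_shared(b, vla_bytes);           // epilog, LIFO
//   __kmpc_free_shared(a, sizeof(x));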
1480 | |
1481 | void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, |
1482 | const OMPExecutableDirective &D, |
1483 | SourceLocation Loc, |
1484 | llvm::Function *OutlinedFn, |
1485 | ArrayRef<llvm::Value *> CapturedVars) { |
1486 | if (!CGF.HaveInsertPoint()) |
1487 | return; |
1488 | |
1489 | Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
1490 | /*Name=*/".zero.addr"); |
1491 | CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
1492 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
1493 | OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer()); |
1494 | OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
1495 | OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
1496 | emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); |
1497 | } |
1498 | |
1499 | void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, |
1500 | SourceLocation Loc, |
1501 | llvm::Function *OutlinedFn, |
1502 | ArrayRef<llvm::Value *> CapturedVars, |
1503 | const Expr *IfCond) { |
1504 | if (!CGF.HaveInsertPoint()) |
1505 | return; |
1506 | |
1507 | auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, |
1508 | IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) { |
1509 | CGBuilderTy &Bld = CGF.Builder; |
1510 | llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; |
1511 | llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); |
1512 | if (WFn) |
1513 | ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); |
1514 | llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); |
1515 | |
1516 | // Create a private scope that will globalize the arguments |
1517 | // passed from the outside of the target region. |
1518 | // TODO: Is that needed? |
1519 | CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); |
1520 | |
1521 | Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( |
1522 | llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), |
1523 | "captured_vars_addrs"); |
1524 | // There's something to share. |
1525 | if (!CapturedVars.empty()) { |
1526 | // Prepare for parallel region. Indicate the outlined function. |
1527 | ASTContext &Ctx = CGF.getContext(); |
1528 | unsigned Idx = 0; |
1529 | for (llvm::Value *V : CapturedVars) { |
1530 | Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); |
1531 | llvm::Value *PtrV; |
1532 | if (V->getType()->isIntegerTy()) |
1533 | PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); |
1534 | else |
1535 | PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); |
1536 | CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, |
1537 | Ctx.getPointerType(Ctx.VoidPtrTy)); |
1538 | ++Idx; |
1539 | } |
1540 | } |
1541 | |
1542 | llvm::Value *IfCondVal = nullptr; |
1543 | if (IfCond) |
1544 | IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, |
1545 | /* isSigned */ false); |
1546 | else |
1547 | IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); |
1548 | |
1549 | assert(IfCondVal && "Expected a value");
1550 | llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
1551 | llvm::Value *Args[] = { |
1552 | RTLoc, |
1553 | getThreadID(CGF, Loc), |
1554 | IfCondVal, |
1555 | llvm::ConstantInt::get(CGF.Int32Ty, -1), |
1556 | llvm::ConstantInt::get(CGF.Int32Ty, -1), |
1557 | FnPtr, |
1558 | ID, |
1559 | Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(), |
1560 | CGF.VoidPtrPtrTy), |
1561 | llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; |
1562 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1563 | CGM.getModule(), OMPRTL___kmpc_parallel_51), |
1564 | Args); |
1565 | }; |
1566 | |
1567 | RegionCodeGenTy RCG(ParallelGen); |
1568 | RCG(CGF); |
1569 | } |
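// For reference, the emitted runtime call has this shape (parameter names
// are descriptive assumptions; -1 means "unspecified" for num_threads and
// proc_bind on this codegen path):
//   __kmpc_parallel_51(loc, thread_id, if_cond, /*num_threads=*/-1,
//                      /*proc_bind=*/-1, outlined_fn, wrapper_fn_or_null,
//                      captured_vars_addrs, /*nargs=*/CapturedVars.size());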
1570 | |
1571 | void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { |
1572 | // Always emit simple barriers! |
1573 | if (!CGF.HaveInsertPoint()) |
1574 | return; |
1575 | // Build call __kmpc_barrier_simple_spmd(nullptr, 0); |
1576 | // This function does not use parameters, so we can emit just default values. |
1577 | llvm::Value *Args[] = { |
1578 | llvm::ConstantPointerNull::get( |
1579 | cast<llvm::PointerType>(getIdentTyPointerTy())), |
1580 | llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; |
1581 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1582 | CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), |
1583 | Args); |
1584 | } |
1585 | |
1586 | void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, |
1587 | SourceLocation Loc, |
1588 | OpenMPDirectiveKind Kind, bool, |
1589 | bool) { |
1590 | // Always emit simple barriers! |
1591 | if (!CGF.HaveInsertPoint()) |
1592 | return; |
1593 | // Build call __kmpc_barrier(loc, thread_id);
1594 | unsigned Flags = getDefaultFlagsForBarriers(Kind); |
1595 | llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), |
1596 | getThreadID(CGF, Loc)}; |
1597 | |
1598 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1599 | CGM.getModule(), OMPRTL___kmpc_barrier), |
1600 | Args); |
1601 | } |
1602 | |
1603 | void CGOpenMPRuntimeGPU::emitCriticalRegion( |
1604 | CodeGenFunction &CGF, StringRef CriticalName, |
1605 | const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, |
1606 | const Expr *Hint) { |
1607 | llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); |
1608 | llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); |
1609 | llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); |
1610 | llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); |
1611 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); |
1612 | |
1613 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1614 | |
1615 | // Get the mask of active threads in the warp. |
1616 | llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1617 | CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); |
1618 | // Fetch team-local id of the thread. |
1619 | llvm::Value *ThreadID = RT.getGPUThreadID(CGF); |
1620 | |
1621 | // Get the width of the team. |
1622 | llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); |
1623 | |
1624 | // Initialize the counter variable for the loop. |
1625 | QualType Int32Ty = |
1626 | CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); |
1627 | Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); |
1628 | LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); |
1629 | CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, |
1630 | /*isInit=*/true); |
1631 | |
1632 | // Block checks if loop counter exceeds upper bound. |
1633 | CGF.EmitBlock(LoopBB); |
1634 | llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); |
1635 | llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); |
1636 | CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); |
1637 | |
1638 | // Block tests which single thread should execute the region, and which
1639 | // threads should go straight to the synchronisation point.
1640 | CGF.EmitBlock(TestBB); |
1641 | CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); |
1642 | llvm::Value *CmpThreadToCounter = |
1643 | CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); |
1644 | CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); |
1645 | |
1646 | // Block emits the body of the critical region. |
1647 | CGF.EmitBlock(BodyBB); |
1648 | |
1649 | // Output the critical statement. |
1650 | CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, |
1651 | Hint); |
1652 | |
1653 | // After the body surrounded by the critical region, the single executing |
1654 | // thread will jump to the synchronisation point. |
1655 | // Block waits for all threads in the current team to finish, then
1656 | // increments the counter variable and returns to the loop.
1657 | CGF.EmitBlock(SyncBB); |
1658 | // Reconverge active threads in the warp. |
1659 | (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1660 | CGM.getModule(), OMPRTL___kmpc_syncwarp), |
1661 | Mask); |
1662 | |
1663 | llvm::Value *IncCounterVal = |
1664 | CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); |
1665 | CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); |
1666 | CGF.EmitBranch(LoopBB); |
1667 | |
1668 | // Block that is reached when all threads in the team complete the region. |
1669 | CGF.EmitBlock(ExitBB, /*IsFinished=*/true); |
1670 | } |
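// A minimal host-side analogy (illustrative assumption, not GPU code) of the
// emitted control flow: each counter iteration admits exactly the one thread
// whose id equals the counter, so the body runs once per thread, in id order.
#include <cstdio>
int main() {
  const int TeamWidth = 4; // stands in for getGPUNumThreads()
  for (int Counter = 0; Counter < TeamWidth; ++Counter)      // omp.critical.loop
    for (int ThreadID = 0; ThreadID < TeamWidth; ++ThreadID) // lockstep lanes
      if (ThreadID == Counter)                               // omp.critical.test
        std::printf("thread %d runs the critical body\n", ThreadID);
}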
1671 | |
1672 | /// Cast value to the specified type. |
1673 | static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, |
1674 | QualType ValTy, QualType CastTy, |
1675 | SourceLocation Loc) { |
1676 | assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1677 | "Cast type must be sized.");
1678 | assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1679 | "Val type must be sized.");
1680 | llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); |
1681 | if (ValTy == CastTy) |
1682 | return Val; |
1683 | if (CGF.getContext().getTypeSizeInChars(ValTy) == |
1684 | CGF.getContext().getTypeSizeInChars(CastTy)) |
1685 | return CGF.Builder.CreateBitCast(Val, LLVMCastTy); |
1686 | if (CastTy->isIntegerType() && ValTy->isIntegerType()) |
1687 | return CGF.Builder.CreateIntCast(Val, LLVMCastTy, |
1688 | CastTy->hasSignedIntegerRepresentation()); |
1689 | Address CastItem = CGF.CreateMemTemp(CastTy); |
1690 | Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
1691 | CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace())); |
1692 | CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, |
1693 | LValueBaseInfo(AlignmentSource::Type), |
1694 | TBAAAccessInfo()); |
1695 | return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, |
1696 | LValueBaseInfo(AlignmentSource::Type), |
1697 | TBAAAccessInfo()); |
1698 | } |
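// An assumption-level analogy of the fallback path above: when neither a
// bitcast nor an integer cast applies, the value round-trips through memory,
// much like a memcpy-based type pun at the C++ level.
#include <cstdint>
#include <cstring>
std::int32_t bitsOf(float F) {
  std::int32_t I;
  static_assert(sizeof(F) == sizeof(I), "same-size reinterpretation");
  std::memcpy(&I, &F, sizeof(I)); // plays the role of the CastItem store/load
  return I;
}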
1699 | |
1700 | /// This function creates calls to one of two shuffle functions to copy |
1701 | /// variables between lanes in a warp. |
1702 | static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, |
1703 | llvm::Value *Elem, |
1704 | QualType ElemType, |
1705 | llvm::Value *Offset, |
1706 | SourceLocation Loc) { |
1707 | CodeGenModule &CGM = CGF.CGM; |
1708 | CGBuilderTy &Bld = CGF.Builder; |
1709 | CGOpenMPRuntimeGPU &RT = |
1710 | *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime())); |
1711 | llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder(); |
1712 | |
1713 | CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); |
1714 | assert(Size.getQuantity() <= 8 &&
1715 | "Unsupported bitwidth in shuffle instruction.");
1716 | |
1717 | RuntimeFunction ShuffleFn = Size.getQuantity() <= 4 |
1718 | ? OMPRTL___kmpc_shuffle_int32 |
1719 | : OMPRTL___kmpc_shuffle_int64; |
1720 | |
1721 | // Cast all types to 32- or 64-bit values before calling shuffle routines. |
1722 | QualType CastTy = CGF.getContext().getIntTypeForBitwidth( |
1723 | Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1); |
1724 | llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc); |
1725 | llvm::Value *WarpSize = |
1726 | Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true); |
1727 | |
1728 | llvm::Value *ShuffledVal = CGF.EmitRuntimeCall( |
1729 | OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn), |
1730 | {ElemCast, Offset, WarpSize}); |
1731 | |
1732 | return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc); |
1733 | } |
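// Standalone sketch of the width selection above: elements of at most four
// bytes travel through the 32-bit shuffle, five to eight bytes through the
// 64-bit one (the enum below is a simplified stand-in, not the real one).
enum class ShuffleWidth { Int32, Int64 };
constexpr ShuffleWidth widthFor(unsigned Bytes) {
  return Bytes <= 4 ? ShuffleWidth::Int32 : ShuffleWidth::Int64;
}
static_assert(widthFor(4) == ShuffleWidth::Int32, "int rides the 32-bit path");
static_assert(widthFor(8) == ShuffleWidth::Int64, "double rides the 64-bit path");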
1734 | |
1735 | static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, |
1736 | Address DestAddr, QualType ElemType, |
1737 | llvm::Value *Offset, SourceLocation Loc) { |
1738 | CGBuilderTy &Bld = CGF.Builder; |
1739 | |
1740 | CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType); |
1741 | // Create the loop over the big-sized data.
1742 | // ptr = (void*)Elem; |
1743 | // ptrEnd = (void*) Elem + 1; |
1744 | // Step = 8; |
1745 | // while (ptr + Step < ptrEnd) |
1746 | // shuffle((int64_t)*ptr); |
1747 | // Step = 4; |
1748 | // while (ptr + Step < ptrEnd) |
1749 | // shuffle((int32_t)*ptr); |
1750 | // ... |
1751 | Address ElemPtr = DestAddr; |
1752 | Address Ptr = SrcAddr; |
1753 | Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( |
1754 | Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy); |
1755 | for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { |
1756 | if (Size < CharUnits::fromQuantity(IntSize)) |
1757 | continue; |
1758 | QualType IntType = CGF.getContext().getIntTypeForBitwidth( |
1759 | CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)), |
1760 | /*Signed=*/1); |
1761 | llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType); |
1762 | Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo()); |
1763 | ElemPtr = |
1764 | Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo()); |
1765 | if (Size.getQuantity() / IntSize > 1) { |
1766 | llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond"); |
1767 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then"); |
1768 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit"); |
1769 | llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); |
1770 | CGF.EmitBlock(PreCondBB); |
1771 | llvm::PHINode *PhiSrc = |
1772 | Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2); |
1773 | PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB); |
1774 | llvm::PHINode *PhiDest = |
1775 | Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); |
1776 | PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB); |
1777 | Ptr = Address(PhiSrc, Ptr.getAlignment()); |
1778 | ElemPtr = Address(PhiDest, ElemPtr.getAlignment()); |
1779 | llvm::Value *PtrDiff = Bld.CreatePtrDiff( |
1780 | PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast( |
1781 | Ptr.getPointer(), CGF.VoidPtrTy)); |
1782 | Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)), |
1783 | ThenBB, ExitBB); |
1784 | CGF.EmitBlock(ThenBB); |
1785 | llvm::Value *Res = createRuntimeShuffleFunction( |
1786 | CGF, |
1787 | CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, |
1788 | LValueBaseInfo(AlignmentSource::Type), |
1789 | TBAAAccessInfo()), |
1790 | IntType, Offset, Loc); |
1791 | CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, |
1792 | LValueBaseInfo(AlignmentSource::Type), |
1793 | TBAAAccessInfo()); |
1794 | Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); |
1795 | Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); |
1796 | PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); |
1797 | PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); |
1798 | CGF.EmitBranch(PreCondBB); |
1799 | CGF.EmitBlock(ExitBB); |
1800 | } else { |
1801 | llvm::Value *Res = createRuntimeShuffleFunction( |
1802 | CGF, |
1803 | CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, |
1804 | LValueBaseInfo(AlignmentSource::Type), |
1805 | TBAAAccessInfo()), |
1806 | IntType, Offset, Loc); |
1807 | CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, |
1808 | LValueBaseInfo(AlignmentSource::Type), |
1809 | TBAAAccessInfo()); |
1810 | Ptr = Bld.CreateConstGEP(Ptr, 1); |
1811 | ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); |
1812 | } |
1813 | Size = Size % IntSize; |
1814 | } |
1815 | } |
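// A runnable sketch (illustrative assumption) of the chunking above: an
// element moves as the largest integer chunks that still fit, and the
// remainder falls through to the next smaller width.
#include <cstdio>
int main() {
  int Size = 7; // e.g. a packed 7-byte element
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < IntSize)
      continue;
    std::printf("%d shuffle(s) of %d byte(s)\n", Size / IntSize, IntSize);
    Size %= IntSize; // mirrors "Size = Size % IntSize;" above
  }
} // prints one 4-byte, one 2-byte, and one 1-byte shuffle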
1816 | |
1817 | namespace { |
1818 | enum CopyAction : unsigned { |
1819 | // RemoteLaneToThread: Copy over a Reduce list from a remote lane in |
1820 | // the warp using shuffle instructions. |
1821 | RemoteLaneToThread, |
1822 | // ThreadCopy: Make a copy of a Reduce list on the thread's stack. |
1823 | ThreadCopy, |
1824 | // ThreadToScratchpad: Copy a team-reduced array to the scratchpad. |
1825 | ThreadToScratchpad, |
1826 | // ScratchpadToThread: Copy from a scratchpad array in global memory |
1827 | // containing team-reduced data to a thread's stack. |
1828 | ScratchpadToThread, |
1829 | }; |
1830 | } // namespace |
1831 | |
1832 | struct CopyOptionsTy { |
1833 | llvm::Value *RemoteLaneOffset; |
1834 | llvm::Value *ScratchpadIndex; |
1835 | llvm::Value *ScratchpadWidth; |
1836 | }; |
1837 | |
1838 | /// Emit instructions to copy a Reduce list, which contains partially |
1839 | /// aggregated values, in the specified direction. |
1840 | static void emitReductionListCopy( |
1841 | CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, |
1842 | ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase, |
1843 | CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { |
1844 | |
1845 | CodeGenModule &CGM = CGF.CGM; |
1846 | ASTContext &C = CGM.getContext(); |
1847 | CGBuilderTy &Bld = CGF.Builder; |
1848 | |
1849 | llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; |
1850 | llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex; |
1851 | llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth; |
1852 | |
1853 | // Iterate, element by element, through the source Reduce list and
1854 | // make a copy.
1855 | unsigned Idx = 0; |
1856 | unsigned Size = Privates.size(); |
1857 | for (const Expr *Private : Privates) { |
1858 | Address SrcElementAddr = Address::invalid(); |
1859 | Address DestElementAddr = Address::invalid(); |
1860 | Address DestElementPtrAddr = Address::invalid(); |
1861 | // Should we shuffle in an element from a remote lane? |
1862 | bool ShuffleInElement = false; |
1863 | // Set to true to update the pointer in the dest Reduce list to a |
1864 | // newly created element. |
1865 | bool UpdateDestListPtr = false; |
1866 | // Increment the src or dest pointer to the scratchpad, for each |
1867 | // new element. |
1868 | bool IncrScratchpadSrc = false; |
1869 | bool IncrScratchpadDest = false; |
1870 | |
1871 | switch (Action) { |
1872 | case RemoteLaneToThread: { |
1873 | // Step 1.1: Get the address for the src element in the Reduce list. |
1874 | Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); |
1875 | SrcElementAddr = CGF.EmitLoadOfPointer( |
1876 | SrcElementPtrAddr, |
1877 | C.getPointerType(Private->getType())->castAs<PointerType>()); |
1878 | |
1879 | // Step 1.2: Create a temporary to store the element in the destination |
1880 | // Reduce list. |
1881 | DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); |
1882 | DestElementAddr = |
1883 | CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); |
1884 | ShuffleInElement = true; |
1885 | UpdateDestListPtr = true; |
1886 | break; |
1887 | } |
1888 | case ThreadCopy: { |
1889 | // Step 1.1: Get the address for the src element in the Reduce list. |
1890 | Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); |
1891 | SrcElementAddr = CGF.EmitLoadOfPointer( |
1892 | SrcElementPtrAddr, |
1893 | C.getPointerType(Private->getType())->castAs<PointerType>()); |
1894 | |
1895 | // Step 1.2: Get the address for dest element. The destination |
1896 | // element has already been created on the thread's stack. |
1897 | DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); |
1898 | DestElementAddr = CGF.EmitLoadOfPointer( |
1899 | DestElementPtrAddr, |
1900 | C.getPointerType(Private->getType())->castAs<PointerType>()); |
1901 | break; |
1902 | } |
1903 | case ThreadToScratchpad: { |
1904 | // Step 1.1: Get the address for the src element in the Reduce list. |
1905 | Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); |
1906 | SrcElementAddr = CGF.EmitLoadOfPointer( |
1907 | SrcElementPtrAddr, |
1908 | C.getPointerType(Private->getType())->castAs<PointerType>()); |
1909 | |
1910 | // Step 1.2: Get the address for dest element: |
1911 | // address = base + index * ElementSizeInChars. |
1912 | llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); |
1913 | llvm::Value *CurrentOffset = |
1914 | Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); |
1915 | llvm::Value *ScratchPadElemAbsolutePtrVal = |
1916 | Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); |
1917 | ScratchPadElemAbsolutePtrVal = |
1918 | Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); |
1919 | DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, |
1920 | C.getTypeAlignInChars(Private->getType())); |
1921 | IncrScratchpadDest = true; |
1922 | break; |
1923 | } |
1924 | case ScratchpadToThread: { |
1925 | // Step 1.1: Get the address for the src element in the scratchpad. |
1926 | // address = base + index * ElementSizeInChars. |
1927 | llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); |
1928 | llvm::Value *CurrentOffset = |
1929 | Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); |
1930 | llvm::Value *ScratchPadElemAbsolutePtrVal = |
1931 | Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); |
1932 | ScratchPadElemAbsolutePtrVal = |
1933 | Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); |
1934 | SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, |
1935 | C.getTypeAlignInChars(Private->getType())); |
1936 | IncrScratchpadSrc = true; |
1937 | |
1938 | // Step 1.2: Create a temporary to store the element in the destination |
1939 | // Reduce list. |
1940 | DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); |
1941 | DestElementAddr = |
1942 | CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); |
1943 | UpdateDestListPtr = true; |
1944 | break; |
1945 | } |
1946 | } |
1947 | |
1948 | // Regardless of the copy's source and destination, we emit the load of
1949 | // the src element, as this is required in all directions.
1950 | SrcElementAddr = Bld.CreateElementBitCast( |
1951 | SrcElementAddr, CGF.ConvertTypeForMem(Private->getType())); |
1952 | DestElementAddr = Bld.CreateElementBitCast(DestElementAddr, |
1953 | SrcElementAddr.getElementType()); |
1954 | |
1955 | // Now that all active lanes have read the element in the |
1956 | // Reduce list, shuffle over the value from the remote lane. |
1957 | if (ShuffleInElement) { |
1958 | shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), |
1959 | RemoteLaneOffset, Private->getExprLoc()); |
1960 | } else { |
1961 | switch (CGF.getEvaluationKind(Private->getType())) { |
1962 | case TEK_Scalar: { |
1963 | llvm::Value *Elem = CGF.EmitLoadOfScalar( |
1964 | SrcElementAddr, /*Volatile=*/false, Private->getType(), |
1965 | Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), |
1966 | TBAAAccessInfo()); |
1967 | // Store the source element value to the dest element address. |
1968 | CGF.EmitStoreOfScalar( |
1969 | Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), |
1970 | LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); |
1971 | break; |
1972 | } |
1973 | case TEK_Complex: { |
1974 | CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( |
1975 | CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), |
1976 | Private->getExprLoc()); |
1977 | CGF.EmitStoreOfComplex( |
1978 | Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), |
1979 | /*isInit=*/false); |
1980 | break; |
1981 | } |
1982 | case TEK_Aggregate: |
1983 | CGF.EmitAggregateCopy( |
1984 | CGF.MakeAddrLValue(DestElementAddr, Private->getType()), |
1985 | CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), |
1986 | Private->getType(), AggValueSlot::DoesNotOverlap); |
1987 | break; |
1988 | } |
1989 | } |
1990 | |
1991 | // Step 3.1: Modify reference in dest Reduce list as needed. |
1992 | // Modifying the reference in Reduce list to point to the newly |
1993 | // created element. The element is live in the current function |
1994 | // scope and that of functions it invokes (i.e., reduce_function). |
1995 | // RemoteReduceData[i] = (void*)&RemoteElem |
1996 | if (UpdateDestListPtr) { |
1997 | CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( |
1998 | DestElementAddr.getPointer(), CGF.VoidPtrTy), |
1999 | DestElementPtrAddr, /*Volatile=*/false, |
2000 | C.VoidPtrTy); |
2001 | } |
2002 | |
2003 | // Step 4.1: Increment SrcBase/DestBase so that it points to the starting |
2004 | // address of the next element in scratchpad memory, unless we're currently |
2005 | // processing the last one. Memory alignment is also taken care of here. |
2006 | if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { |
2007 | llvm::Value *ScratchpadBasePtr = |
2008 | IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer(); |
2009 | llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); |
2010 | ScratchpadBasePtr = Bld.CreateNUWAdd( |
2011 | ScratchpadBasePtr, |
2012 | Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars)); |
2013 | |
2014 | // Take care of global memory alignment for performance |
2015 | ScratchpadBasePtr = Bld.CreateNUWSub( |
2016 | ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); |
2017 | ScratchpadBasePtr = Bld.CreateUDiv( |
2018 | ScratchpadBasePtr, |
2019 | llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); |
2020 | ScratchpadBasePtr = Bld.CreateNUWAdd( |
2021 | ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); |
2022 | ScratchpadBasePtr = Bld.CreateNUWMul( |
2023 | ScratchpadBasePtr, |
2024 | llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); |
2025 | |
2026 | if (IncrScratchpadDest) |
2027 | DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); |
2028 | else /* IncrScratchpadSrc = true */ |
2029 | SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); |
2030 | } |
2031 | |
2032 | ++Idx; |
2033 | } |
2034 | } |
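// Standalone sketch of the alignment fix-up above: the sub-1/udiv/add-1/mul
// sequence rounds the scratchpad base up to the next multiple of
// GlobalMemoryAlignment (128 below is an assumed illustrative value).
constexpr unsigned long long roundUpTo(unsigned long long Ptr,
                                       unsigned long long Align) {
  return ((Ptr - 1) / Align + 1) * Align;
}
static_assert(roundUpTo(100, 128) == 128, "unaligned bases move up");
static_assert(roundUpTo(256, 128) == 256, "aligned bases stay put");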
2035 | |
2036 | /// This function emits a helper that gathers Reduce lists from the first |
2037 | /// lane of every active warp to lanes in the first warp. |
2038 | /// |
2039 | /// void inter_warp_copy_func(void* reduce_data, int num_warps)
2040 | /// shared smem[warp_size]; |
2041 | /// For all data entries D in reduce_data: |
2042 | /// sync |
2043 | /// If (I am the first lane in each warp) |
2044 | /// Copy my local D to smem[warp_id] |
2045 | /// sync |
2046 | /// if (I am the first warp) |
2047 | /// Copy smem[thread_id] to my local D |
2048 | static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, |
2049 | ArrayRef<const Expr *> Privates, |
2050 | QualType ReductionArrayTy, |
2051 | SourceLocation Loc) { |
2052 | ASTContext &C = CGM.getContext(); |
2053 | llvm::Module &M = CGM.getModule(); |
2054 | |
2055 | // ReduceList: thread local Reduce list. |
2056 | // At the stage of the computation when this function is called, partially |
2057 | // aggregated values reside in the first lane of every active warp. |
2058 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2059 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2060 | // NumWarps: number of warps active in the parallel region. This could |
2061 | // be smaller than 32 (max warps in a CTA) for partial block reduction. |
2062 | ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2063 | C.getIntTypeForBitwidth(32, /* Signed */ true), |
2064 | ImplicitParamDecl::Other); |
2065 | FunctionArgList Args; |
2066 | Args.push_back(&ReduceListArg); |
2067 | Args.push_back(&NumWarpsArg); |
2068 | |
2069 | const CGFunctionInfo &CGFI = |
2070 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2071 | auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), |
2072 | llvm::GlobalValue::InternalLinkage, |
2073 | "_omp_reduction_inter_warp_copy_func", &M); |
2074 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2075 | Fn->setDoesNotRecurse(); |
2076 | CodeGenFunction CGF(CGM); |
2077 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2078 | |
2079 | CGBuilderTy &Bld = CGF.Builder; |
2080 | |
2081 | // This array is used as a medium to transfer, one reduce element at a time, |
2082 | // the data from the first lane of every warp to lanes in the first warp |
2083 | // in order to perform the final step of a reduction in a parallel region |
2084 | // (reduction across warps). The array is placed in NVPTX __shared__ memory |
2085 | // for reduced latency, as well as to have a distinct copy for concurrently |
2086 | // executing target regions. The array is declared with weak linkage so
2087 | // as to be shared across compilation units. |
2088 | StringRef TransferMediumName = |
2089 | "__openmp_nvptx_data_transfer_temporary_storage"; |
2090 | llvm::GlobalVariable *TransferMedium = |
2091 | M.getGlobalVariable(TransferMediumName); |
2092 | unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); |
2093 | if (!TransferMedium) { |
2094 | auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); |
2095 | unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); |
2096 | TransferMedium = new llvm::GlobalVariable( |
2097 | M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, |
2098 | llvm::UndefValue::get(Ty), TransferMediumName, |
2099 | /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, |
2100 | SharedAddressSpace); |
2101 | CGM.addCompilerUsedGlobal(TransferMedium); |
2102 | } |
2103 | |
2104 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
2105 | // Get the CUDA thread id of the current OpenMP thread on the GPU. |
2106 | llvm::Value *ThreadID = RT.getGPUThreadID(CGF); |
2107 | // nvptx_lane_id = nvptx_id % warpsize |
2108 | llvm::Value *LaneID = getNVPTXLaneID(CGF); |
2109 | // nvptx_warp_id = nvptx_id / warpsize |
2110 | llvm::Value *WarpID = getNVPTXWarpID(CGF); |
2111 | |
2112 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2113 | Address LocalReduceList( |
2114 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2115 | CGF.EmitLoadOfScalar( |
2116 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, |
2117 | LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), |
2118 | CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), |
2119 | CGF.getPointerAlign()); |
2120 | |
2121 | unsigned Idx = 0; |
2122 | for (const Expr *Private : Privates) { |
2123 | // |
2124 | // Warp master copies reduce element to transfer medium in __shared__ |
2125 | // memory. |
2126 | // |
2127 | unsigned RealTySize = |
2128 | C.getTypeSizeInChars(Private->getType()) |
2129 | .alignTo(C.getTypeAlignInChars(Private->getType())) |
2130 | .getQuantity(); |
2131 | for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2132 | unsigned NumIters = RealTySize / TySize; |
2133 | if (NumIters == 0) |
2134 | continue; |
2135 | QualType CType = C.getIntTypeForBitwidth( |
2136 | C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); |
2137 | llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); |
2138 | CharUnits Align = CharUnits::fromQuantity(TySize); |
2139 | llvm::Value *Cnt = nullptr; |
2140 | Address CntAddr = Address::invalid(); |
2141 | llvm::BasicBlock *PrecondBB = nullptr; |
2142 | llvm::BasicBlock *ExitBB = nullptr; |
2143 | if (NumIters > 1) { |
2144 | CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); |
2145 | CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, |
2146 | /*Volatile=*/false, C.IntTy); |
2147 | PrecondBB = CGF.createBasicBlock("precond"); |
2148 | ExitBB = CGF.createBasicBlock("exit"); |
2149 | llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); |
2150 | // There is no need to emit line number for unconditional branch. |
2151 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
2152 | CGF.EmitBlock(PrecondBB); |
2153 | Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); |
2154 | llvm::Value *Cmp = |
2155 | Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); |
2156 | Bld.CreateCondBr(Cmp, BodyBB, ExitBB); |
2157 | CGF.EmitBlock(BodyBB); |
2158 | } |
2159 | // kmpc_barrier. |
2160 | CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, |
2161 | /*EmitChecks=*/false, |
2162 | /*ForceSimpleCall=*/true); |
2163 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); |
2164 | llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); |
2165 | llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); |
2166 | |
2167 | // if (lane_id == 0) |
2168 | llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); |
2169 | Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); |
2170 | CGF.EmitBlock(ThenBB); |
2171 | |
2172 | // Reduce element = LocalReduceList[i] |
2173 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); |
2174 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
2175 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
2176 | // elemptr = ((CopyType*)(elemptrptr)) + I |
2177 | Address ElemPtr = Address(ElemPtrPtr, Align); |
2178 | ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); |
2179 | if (NumIters > 1) { |
2180 | ElemPtr = Address(Bld.CreateGEP(ElemPtr.getElementType(), |
2181 | ElemPtr.getPointer(), Cnt), |
2182 | ElemPtr.getAlignment()); |
2183 | } |
2184 | |
2185 | // Get pointer to location in transfer medium. |
2186 | // MediumPtr = &medium[warp_id] |
2187 | llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( |
2188 | TransferMedium->getValueType(), TransferMedium, |
2189 | {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); |
2190 | Address MediumPtr(MediumPtrVal, Align); |
2191 | // Casting to actual data type. |
2192 | // MediumPtr = (CopyType*)MediumPtrAddr; |
2193 | MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); |
2194 | |
2195 | // elem = *elemptr |
2196 | // *MediumPtr = elem
2197 | llvm::Value *Elem = CGF.EmitLoadOfScalar( |
2198 | ElemPtr, /*Volatile=*/false, CType, Loc, |
2199 | LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); |
2200 | // Store the source element value to the dest element address. |
2201 | CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, |
2202 | LValueBaseInfo(AlignmentSource::Type), |
2203 | TBAAAccessInfo()); |
2204 | |
2205 | Bld.CreateBr(MergeBB); |
2206 | |
2207 | CGF.EmitBlock(ElseBB); |
2208 | Bld.CreateBr(MergeBB); |
2209 | |
2210 | CGF.EmitBlock(MergeBB); |
2211 | |
2212 | // kmpc_barrier. |
2213 | CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, |
2214 | /*EmitChecks=*/false, |
2215 | /*ForceSimpleCall=*/true); |
2216 | |
2217 | // |
2218 | // Warp 0 copies reduce element from transfer medium. |
2219 | // |
2220 | llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); |
2221 | llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); |
2222 | llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); |
2223 | |
2224 | Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); |
2225 | llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( |
2226 | AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); |
2227 | |
2228 | // Up to 32 threads in warp 0 are active. |
2229 | llvm::Value *IsActiveThread = |
2230 | Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); |
2231 | Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); |
2232 | |
2233 | CGF.EmitBlock(W0ThenBB); |
2234 | |
2235 | // SrcMediumPtr = &medium[tid] |
2236 | llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( |
2237 | TransferMedium->getValueType(), TransferMedium, |
2238 | {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); |
2239 | Address SrcMediumPtr(SrcMediumPtrVal, Align); |
2240 | // SrcMediumVal = *SrcMediumPtr; |
2241 | SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); |
2242 | |
2243 | // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I |
2244 | Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); |
2245 | llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( |
2246 | TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); |
2247 | Address TargetElemPtr = Address(TargetElemPtrVal, Align); |
2248 | TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); |
2249 | if (NumIters > 1) { |
2250 | TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getElementType(), |
2251 | TargetElemPtr.getPointer(), Cnt), |
2252 | TargetElemPtr.getAlignment()); |
2253 | } |
2254 | |
2255 | // *TargetElemPtr = SrcMediumVal; |
2256 | llvm::Value *SrcMediumValue = |
2257 | CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); |
2258 | CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, |
2259 | CType); |
2260 | Bld.CreateBr(W0MergeBB); |
2261 | |
2262 | CGF.EmitBlock(W0ElseBB); |
2263 | Bld.CreateBr(W0MergeBB); |
2264 | |
2265 | CGF.EmitBlock(W0MergeBB); |
2266 | |
2267 | if (NumIters > 1) { |
2268 | Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); |
2269 | CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); |
2270 | CGF.EmitBranch(PrecondBB); |
2271 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
2272 | CGF.EmitBlock(ExitBB); |
2273 | } |
2274 | RealTySize %= TySize; |
2275 | } |
2276 | ++Idx; |
2277 | } |
2278 | |
2279 | CGF.FinishFunction(); |
2280 | return Fn; |
2281 | } |
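// Sketch of the id decomposition used above (getNVPTXLaneID/getNVPTXWarpID),
// assuming the typical NVPTX warp size of 32: lane = tid % 32, warp = tid / 32.
constexpr unsigned kWarpSize = 32;
constexpr unsigned laneOf(unsigned Tid) { return Tid % kWarpSize; }
constexpr unsigned warpOf(unsigned Tid) { return Tid / kWarpSize; }
static_assert(laneOf(37) == 5 && warpOf(37) == 1, "thread 37 is warp 1, lane 5");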
2282 | |
2283 | /// Emit a helper that reduces data across two OpenMP threads (lanes) |
2284 | /// in the same warp. It uses shuffle instructions to copy over data from |
2285 | /// a remote lane's stack. The reduction algorithm performed is specified |
2286 | /// by the fourth parameter. |
2287 | /// |
2288 | /// Algorithm Versions. |
2289 | /// Full Warp Reduce (argument value 0): |
2290 | /// This algorithm assumes that all 32 lanes are active and gathers |
2291 | /// data from these 32 lanes, producing a single resultant value. |
2292 | /// Contiguous Partial Warp Reduce (argument value 1): |
2293 | /// This algorithm assumes that only a *contiguous* subset of lanes |
2294 | /// are active. This happens for the last warp in a parallel region |
2295 | /// when the user specified num_threads is not an integer multiple of |
2296 | /// 32. This contiguous subset always starts with the zeroth lane. |
2297 | /// Partial Warp Reduce (argument value 2): |
2298 | /// This algorithm gathers data from any number of lanes at any position. |
2299 | /// All reduced values are stored in the lowest possible lane. The set |
2300 | /// of problems every algorithm addresses is a super set of those |
2301 | /// addressable by algorithms with a lower version number. Overhead |
2302 | /// of problems every algorithm addresses is a superset of those
2303 | /// |
2304 | /// Terminology |
2305 | /// Reduce element: |
2306 | /// Reduce element refers to the individual data field with primitive |
2307 | /// data types to be combined and reduced across threads. |
2308 | /// Reduce list: |
2309 | /// Reduce list refers to a collection of local, thread-private |
2310 | /// reduce elements. |
2311 | /// Remote Reduce list: |
2312 | /// Remote Reduce list refers to a collection of remote (relative to |
2313 | /// the current thread) reduce elements. |
2314 | /// |
2315 | /// We distinguish between three states of threads that are important to |
2316 | /// the implementation of this function. |
2317 | /// Alive threads: |
2318 | /// Threads in a warp executing the SIMT instruction, as distinguished from |
2319 | /// threads that are inactive due to divergent control flow. |
2320 | /// Active threads: |
2321 | /// The minimal set of threads that has to be alive upon entry to this |
2322 | /// function. The computation is correct iff active threads are alive. |
2323 | /// Some threads are alive but they are not active because they do not |
2324 | /// contribute to the computation in any useful manner. Turning them off |
2325 | /// may introduce control flow overheads without any tangible benefits. |
2326 | /// Effective threads: |
2327 | /// In order to comply with the argument requirements of the shuffle |
2328 | /// function, we must keep all lanes holding data alive. But at most |
2329 | /// half of them perform value aggregation; we refer to this half of |
2330 | /// threads as effective. The other half is simply handing off their |
2331 | /// data. |
2332 | /// |
2333 | /// Procedure |
2334 | /// Value shuffle: |
2335 | /// In this step active threads transfer data from higher lane positions |
2336 | /// in the warp to lower lane positions, creating Remote Reduce list. |
2337 | /// Value aggregation: |
2338 | /// In this step, effective threads combine their thread local Reduce list |
2339 | /// with Remote Reduce list and store the result in the thread local |
2340 | /// Reduce list. |
2341 | /// Value copy: |
2342 | /// In this step, we deal with the assumption made by algorithm 2 |
2343 | /// (i.e. contiguity assumption). When we have an odd number of lanes |
2344 | /// active, say 2k+1, only k threads will be effective and therefore k |
2345 | /// new values will be produced. However, the Reduce list owned by the |
2346 | /// (2k+1)th thread is ignored in the value aggregation. Therefore |
2347 | /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so |
2348 | /// that the contiguity assumption still holds. |
2349 | static llvm::Function *emitShuffleAndReduceFunction( |
2350 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2351 | QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { |
2352 | ASTContext &C = CGM.getContext(); |
2353 | |
2354 | // Thread local Reduce list used to host the values of data to be reduced. |
2355 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2356 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2357 | // Current lane id; could be logical. |
2358 | ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, |
2359 | ImplicitParamDecl::Other); |
2360 | // Offset of the remote source lane relative to the current lane. |
2361 | ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2362 | C.ShortTy, ImplicitParamDecl::Other); |
2363 | // Algorithm version. This is expected to be known at compile time. |
2364 | ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2365 | C.ShortTy, ImplicitParamDecl::Other); |
2366 | FunctionArgList Args; |
2367 | Args.push_back(&ReduceListArg); |
2368 | Args.push_back(&LaneIDArg); |
2369 | Args.push_back(&RemoteLaneOffsetArg); |
2370 | Args.push_back(&AlgoVerArg); |
2371 | |
2372 | const CGFunctionInfo &CGFI = |
2373 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2374 | auto *Fn = llvm::Function::Create( |
2375 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
2376 | "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); |
2377 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2378 | Fn->setDoesNotRecurse(); |
2379 | |
2380 | CodeGenFunction CGF(CGM); |
2381 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2382 | |
2383 | CGBuilderTy &Bld = CGF.Builder; |
2384 | |
2385 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2386 | Address LocalReduceList( |
2387 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2388 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2389 | C.VoidPtrTy, SourceLocation()), |
2390 | CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), |
2391 | CGF.getPointerAlign()); |
2392 | |
2393 | Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); |
2394 | llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( |
2395 | AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2396 | |
2397 | Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); |
2398 | llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( |
2399 | AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2400 | |
2401 | Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); |
2402 | llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( |
2403 | AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2404 | |
2405 | // Create a local thread-private variable to host the Reduce list |
2406 | // from a remote lane. |
2407 | Address RemoteReduceList = |
2408 | CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); |
2409 | |
2410 | // This loop iterates through the list of reduce elements and copies, |
2411 | // element by element, from a remote lane in the warp to RemoteReduceList, |
2412 | // hosted on the thread's stack. |
2413 | emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, |
2414 | LocalReduceList, RemoteReduceList, |
2415 | {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, |
2416 | /*ScratchpadIndex=*/nullptr, |
2417 | /*ScratchpadWidth=*/nullptr}); |
2418 | |
2419 | // The actions to be performed on the Remote Reduce list depend
2420 | // on the algorithm version.
2421 | // |
2422 | // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && |
2423 | // LaneId % 2 == 0 && Offset > 0): |
2424 | // do the reduction value aggregation |
2425 | // |
2426 | // The thread local variable Reduce list is mutated in place to host the |
2427 | // reduced data, which is the aggregated value produced from local and |
2428 | // remote lanes. |
2429 | // |
2430 | // Note that AlgoVer is expected to be a constant integer known at compile |
2431 | // time. |
2432 | // When AlgoVer==0, the first conjunction evaluates to true, making
2433 | // the entire predicate true at compile time.
2434 | // When AlgoVer==1, only the second part of the second conjunction needs
2435 | // to be evaluated at runtime. The other conjunctions evaluate to false
2436 | // at compile time.
2437 | // When AlgoVer==2, only the second part of the third conjunction needs
2438 | // to be evaluated at runtime. The other conjunctions evaluate to false
2439 | // at compile time.
2440 | llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); |
2441 | |
2442 | llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); |
2443 | llvm::Value *CondAlgo1 = Bld.CreateAnd( |
2444 | Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); |
2445 | |
2446 | llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); |
2447 | llvm::Value *CondAlgo2 = Bld.CreateAnd( |
2448 | Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); |
2449 | CondAlgo2 = Bld.CreateAnd( |
2450 | CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); |
2451 | |
2452 | llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); |
2453 | CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); |
2454 | |
2455 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); |
2456 | llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); |
2457 | llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); |
2458 | Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); |
2459 | |
2460 | CGF.EmitBlock(ThenBB); |
2461 | // reduce_function(LocalReduceList, RemoteReduceList) |
2462 | llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2463 | LocalReduceList.getPointer(), CGF.VoidPtrTy); |
2464 | llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2465 | RemoteReduceList.getPointer(), CGF.VoidPtrTy); |
2466 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2467 | CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); |
2468 | Bld.CreateBr(MergeBB); |
2469 | |
2470 | CGF.EmitBlock(ElseBB); |
2471 | Bld.CreateBr(MergeBB); |
2472 | |
2473 | CGF.EmitBlock(MergeBB); |
2474 | |
2475 | // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local |
2476 | // Reduce list. |
2477 | Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); |
2478 | llvm::Value *CondCopy = Bld.CreateAnd( |
2479 | Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); |
2480 | |
2481 | llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); |
2482 | llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); |
2483 | llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); |
2484 | Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); |
2485 | |
2486 | CGF.EmitBlock(CpyThenBB); |
2487 | emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, |
2488 | RemoteReduceList, LocalReduceList); |
2489 | Bld.CreateBr(CpyMergeBB); |
2490 | |
2491 | CGF.EmitBlock(CpyElseBB); |
2492 | Bld.CreateBr(CpyMergeBB); |
2493 | |
2494 | CGF.EmitBlock(CpyMergeBB); |
2495 | |
2496 | CGF.FinishFunction(); |
2497 | return Fn; |
2498 | } |
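// Sketch of the reduce/copy predicates built above; since AlgoVer is a
// compile-time constant in practice, all but one conjunct folds away.
constexpr bool doesReduce(int Algo, int Lane, int Offset) {
  return Algo == 0 || (Algo == 1 && Lane < Offset) ||
         (Algo == 2 && Lane % 2 == 0 && Offset > 0);
}
constexpr bool doesCopy(int Algo, int Lane, int Offset) {
  return Algo == 1 && Lane >= Offset; // remote list replaces the local one
}
static_assert(doesReduce(1, 0, 4) && !doesReduce(1, 5, 4), "algo 1: low lanes reduce");
static_assert(doesCopy(1, 5, 4), "algo 1: high lanes hand off their data");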
2499 | |
2500 | /// This function emits a helper that copies all the reduction variables from |
2501 | /// the team into the provided global buffer for the reduction variables. |
2502 | /// |
2503 | /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) |
2504 | /// For all data entries D in reduce_data: |
2505 | /// Copy local D to buffer.D[Idx] |
2506 | static llvm::Value *emitListToGlobalCopyFunction( |
2507 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2508 | QualType ReductionArrayTy, SourceLocation Loc, |
2509 | const RecordDecl *TeamReductionRec, |
2510 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2511 | &VarFieldMap) { |
2512 | ASTContext &C = CGM.getContext(); |
2513 | |
2514 | // Buffer: global reduction buffer. |
2515 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2516 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2517 | // Idx: index of the buffer. |
2518 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2519 | ImplicitParamDecl::Other); |
2520 | // ReduceList: thread local Reduce list. |
2521 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2522 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2523 | FunctionArgList Args; |
2524 | Args.push_back(&BufferArg); |
2525 | Args.push_back(&IdxArg); |
2526 | Args.push_back(&ReduceListArg); |
2527 | |
2528 | const CGFunctionInfo &CGFI = |
2529 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2530 | auto *Fn = llvm::Function::Create( |
2531 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
2532 | "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); |
2533 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2534 | Fn->setDoesNotRecurse(); |
2535 | CodeGenFunction CGF(CGM); |
2536 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2537 | |
2538 | CGBuilderTy &Bld = CGF.Builder; |
2539 | |
2540 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2541 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2542 | Address LocalReduceList( |
2543 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2544 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2545 | C.VoidPtrTy, Loc), |
2546 | CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), |
2547 | CGF.getPointerAlign()); |
2548 | QualType StaticTy = C.getRecordType(TeamReductionRec); |
2549 | llvm::Type *LLVMReductionsBufferTy = |
2550 | CGM.getTypes().ConvertTypeForMem(StaticTy); |
2551 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2552 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2553 | LLVMReductionsBufferTy->getPointerTo()); |
2554 | llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), |
2555 | CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2556 | /*Volatile=*/false, C.IntTy, |
2557 | Loc)}; |
2558 | unsigned Idx = 0; |
2559 | for (const Expr *Private : Privates) { |
2560 | // Reduce element = LocalReduceList[i] |
2561 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); |
2562 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
2563 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
2564 | // elemptr = ((CopyType*)(elemptrptr)) + I |
2565 | ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2566 | ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); |
2567 | Address ElemPtr = |
2568 | Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); |
2569 | const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); |
2570 | // Global = Buffer.VD[Idx]; |
2571 | const FieldDecl *FD = VarFieldMap.lookup(VD); |
2572 | LValue GlobLVal = CGF.EmitLValueForField( |
2573 | CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); |
2574 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2575 | llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( |
2576 | GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); |
2577 | GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); |
2578 | switch (CGF.getEvaluationKind(Private->getType())) { |
2579 | case TEK_Scalar: { |
2580 | llvm::Value *V = CGF.EmitLoadOfScalar( |
2581 | ElemPtr, /*Volatile=*/false, Private->getType(), Loc, |
2582 | LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); |
2583 | CGF.EmitStoreOfScalar(V, GlobLVal); |
2584 | break; |
2585 | } |
2586 | case TEK_Complex: { |
2587 | CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( |
2588 | CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); |
2589 | CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); |
2590 | break; |
2591 | } |
2592 | case TEK_Aggregate: |
2593 | CGF.EmitAggregateCopy(GlobLVal, |
2594 | CGF.MakeAddrLValue(ElemPtr, Private->getType()), |
2595 | Private->getType(), AggValueSlot::DoesNotOverlap); |
2596 | break; |
2597 | } |
2598 | ++Idx; |
2599 | } |
2600 | |
2601 | CGF.FinishFunction(); |
2602 | return Fn; |
2603 | } |
2604 | |
2605 | /// This function emits a helper that reduces all the reduction variables from |
2606 | /// the team into the provided global buffer for the reduction variables. |
2607 | /// |
2608 | /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) |
2609 | /// void *GlobPtrs[]; |
2610 | /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; |
2611 | /// ... |
2612 | /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; |
2613 | /// reduce_function(GlobPtrs, reduce_data); |
2614 | static llvm::Value *emitListToGlobalReduceFunction( |
2615 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2616 | QualType ReductionArrayTy, SourceLocation Loc, |
2617 | const RecordDecl *TeamReductionRec, |
2618 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2619 | &VarFieldMap, |
2620 | llvm::Function *ReduceFn) { |
2621 | ASTContext &C = CGM.getContext(); |
2622 | |
2623 | // Buffer: global reduction buffer. |
2624 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2625 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2626 | // Idx: index of the buffer. |
2627 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2628 | ImplicitParamDecl::Other); |
2629 | // ReduceList: thread local Reduce list. |
2630 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2631 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2632 | FunctionArgList Args; |
2633 | Args.push_back(&BufferArg); |
2634 | Args.push_back(&IdxArg); |
2635 | Args.push_back(&ReduceListArg); |
2636 | |
2637 | const CGFunctionInfo &CGFI = |
2638 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2639 | auto *Fn = llvm::Function::Create( |
2640 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
2641 | "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); |
2642 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2643 | Fn->setDoesNotRecurse(); |
2644 | CodeGenFunction CGF(CGM); |
2645 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2646 | |
2647 | CGBuilderTy &Bld = CGF.Builder; |
2648 | |
2649 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2650 | QualType StaticTy = C.getRecordType(TeamReductionRec); |
2651 | llvm::Type *LLVMReductionsBufferTy = |
2652 | CGM.getTypes().ConvertTypeForMem(StaticTy); |
2653 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2654 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2655 | LLVMReductionsBufferTy->getPointerTo()); |
2656 | |
2657 | // 1. Build a list of reduction variables. |
2658 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
2659 | Address ReductionList = |
2660 | CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); |
2661 | auto IPriv = Privates.begin(); |
2662 | llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), |
2663 | CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2664 | /*Volatile=*/false, C.IntTy, |
2665 | Loc)}; |
2666 | unsigned Idx = 0; |
2667 | for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { |
2668 | Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
2669 | // Global = Buffer.VD[Idx]; |
2670 | const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); |
2671 | const FieldDecl *FD = VarFieldMap.lookup(VD); |
2672 | LValue GlobLVal = CGF.EmitLValueForField( |
2673 | CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); |
2674 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2675 | llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( |
2676 | GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); |
2677 | llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); |
2678 | CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); |
2679 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
2680 | // Store array size. |
2681 | ++Idx; |
2682 | Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
2683 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
2684 | CGF.getVLASize( |
2685 | CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) |
2686 | .NumElts, |
2687 | CGF.SizeTy, /*isSigned=*/false); |
2688 | CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), |
2689 | Elem); |
2690 | } |
2691 | } |
2692 | |
2693 | // Call reduce_function(GlobalReduceList, ReduceList) |
2694 | llvm::Value *GlobalReduceList = |
2695 | CGF.EmitCastToVoidPtr(ReductionList.getPointer()); |
2696 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2697 | llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( |
2698 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); |
2699 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2700 | CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr}); |
2701 | CGF.FinishFunction(); |
2702 | return Fn; |
2703 | } |
2704 | |
2705 | /// This function emits a helper that copies all the reduction variables from
2706 | /// the provided global buffer back into the thread-local reduce list.
2707 | ///
2708 | /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2709 | /// For all data entries D in reduce_data: |
2710 | /// Copy buffer.D[Idx] to local D; |
2711 | static llvm::Value *emitGlobalToListCopyFunction( |
2712 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2713 | QualType ReductionArrayTy, SourceLocation Loc, |
2714 | const RecordDecl *TeamReductionRec, |
2715 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2716 | &VarFieldMap) { |
2717 | ASTContext &C = CGM.getContext(); |
2718 | |
2719 | // Buffer: global reduction buffer. |
2720 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2721 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2722 | // Idx: index of the buffer. |
2723 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2724 | ImplicitParamDecl::Other); |
2725 | // ReduceList: thread local Reduce list. |
2726 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2727 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2728 | FunctionArgList Args; |
2729 | Args.push_back(&BufferArg); |
2730 | Args.push_back(&IdxArg); |
2731 | Args.push_back(&ReduceListArg); |
2732 | |
2733 | const CGFunctionInfo &CGFI = |
2734 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2735 | auto *Fn = llvm::Function::Create( |
2736 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
2737 | "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); |
2738 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2739 | Fn->setDoesNotRecurse(); |
2740 | CodeGenFunction CGF(CGM); |
2741 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2742 | |
2743 | CGBuilderTy &Bld = CGF.Builder; |
2744 | |
2745 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2746 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2747 | Address LocalReduceList( |
2748 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2749 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2750 | C.VoidPtrTy, Loc), |
2751 | CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), |
2752 | CGF.getPointerAlign()); |
2753 | QualType StaticTy = C.getRecordType(TeamReductionRec); |
2754 | llvm::Type *LLVMReductionsBufferTy = |
2755 | CGM.getTypes().ConvertTypeForMem(StaticTy); |
2756 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2757 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2758 | LLVMReductionsBufferTy->getPointerTo()); |
2759 | |
2760 | llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), |
2761 | CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2762 | /*Volatile=*/false, C.IntTy, |
2763 | Loc)}; |
2764 | unsigned Idx = 0; |
2765 | for (const Expr *Private : Privates) { |
2766 | // Reduce element = LocalReduceList[i] |
2767 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); |
2768 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
2769 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
2770 | // elemptr = ((CopyType*)(elemptrptr)) + I |
2771 | ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2772 | ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); |
2773 | Address ElemPtr = |
2774 | Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); |
2775 | const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); |
2776 | // Global = Buffer.VD[Idx]; |
2777 | const FieldDecl *FD = VarFieldMap.lookup(VD); |
2778 | LValue GlobLVal = CGF.EmitLValueForField( |
2779 | CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); |
2780 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2781 | llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( |
2782 | GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); |
2783 | GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); |
2784 | switch (CGF.getEvaluationKind(Private->getType())) { |
2785 | case TEK_Scalar: { |
2786 | llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); |
2787 | CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), |
2788 | LValueBaseInfo(AlignmentSource::Type), |
2789 | TBAAAccessInfo()); |
2790 | break; |
2791 | } |
2792 | case TEK_Complex: { |
2793 | CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); |
2794 | CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), |
2795 | /*isInit=*/false); |
2796 | break; |
2797 | } |
2798 | case TEK_Aggregate: |
2799 | CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), |
2800 | GlobLVal, Private->getType(), |
2801 | AggValueSlot::DoesNotOverlap); |
2802 | break; |
2803 | } |
2804 | ++Idx; |
2805 | } |
2806 | |
2807 | CGF.FinishFunction(); |
2808 | return Fn; |
2809 | } |
2810 | |
2811 | /// This function emits a helper that reduces all the reduction variables in
2812 | /// the provided global buffer into the thread-local reduce list.
2813 | /// |
2814 | /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) |
2815 | /// void *GlobPtrs[]; |
2816 | /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; |
2817 | /// ... |
2818 | /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; |
2819 | /// reduce_function(reduce_data, GlobPtrs); |
2820 | static llvm::Value *emitGlobalToListReduceFunction( |
2821 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2822 | QualType ReductionArrayTy, SourceLocation Loc, |
2823 | const RecordDecl *TeamReductionRec, |
2824 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2825 | &VarFieldMap, |
2826 | llvm::Function *ReduceFn) { |
2827 | ASTContext &C = CGM.getContext(); |
2828 | |
2829 | // Buffer: global reduction buffer. |
2830 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2831 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2832 | // Idx: index of the buffer. |
2833 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2834 | ImplicitParamDecl::Other); |
2835 | // ReduceList: thread local Reduce list. |
2836 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2837 | C.VoidPtrTy, ImplicitParamDecl::Other); |
2838 | FunctionArgList Args; |
2839 | Args.push_back(&BufferArg); |
2840 | Args.push_back(&IdxArg); |
2841 | Args.push_back(&ReduceListArg); |
2842 | |
2843 | const CGFunctionInfo &CGFI = |
2844 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2845 | auto *Fn = llvm::Function::Create( |
2846 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
2847 | "_omp_reduction_global_to_list_reduce_func", &CGM.getModule()); |
2848 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
2849 | Fn->setDoesNotRecurse(); |
2850 | CodeGenFunction CGF(CGM); |
2851 | CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); |
2852 | |
2853 | CGBuilderTy &Bld = CGF.Builder; |
2854 | |
2855 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2856 | QualType StaticTy = C.getRecordType(TeamReductionRec); |
2857 | llvm::Type *LLVMReductionsBufferTy = |
2858 | CGM.getTypes().ConvertTypeForMem(StaticTy); |
2859 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2860 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2861 | LLVMReductionsBufferTy->getPointerTo()); |
2862 | |
2863 | // 1. Build a list of reduction variables. |
2864 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
2865 | Address ReductionList = |
2866 | CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); |
2867 | auto IPriv = Privates.begin(); |
2868 | llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), |
2869 | CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2870 | /*Volatile=*/false, C.IntTy, |
2871 | Loc)}; |
2872 | unsigned Idx = 0; |
2873 | for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { |
2874 | Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
2875 | // Global = Buffer.VD[Idx]; |
2876 | const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); |
2877 | const FieldDecl *FD = VarFieldMap.lookup(VD); |
2878 | LValue GlobLVal = CGF.EmitLValueForField( |
2879 | CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); |
2880 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2881 | llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( |
2882 | GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); |
2883 | llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); |
2884 | CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); |
2885 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
2886 | // Store array size. |
2887 | ++Idx; |
2888 | Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
2889 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
2890 | CGF.getVLASize( |
2891 | CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) |
2892 | .NumElts, |
2893 | CGF.SizeTy, /*isSigned=*/false); |
2894 | CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), |
2895 | Elem); |
2896 | } |
2897 | } |
2898 | |
2899 | // Call reduce_function(ReduceList, GlobalReduceList) |
2900 | llvm::Value *GlobalReduceList = |
2901 | CGF.EmitCastToVoidPtr(ReductionList.getPointer()); |
2902 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2903 | llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( |
2904 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); |
2905 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2906 | CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); |
2907 | CGF.FinishFunction(); |
2908 | return Fn; |
2909 | } |
2910 | |
2911 | /// |
2912 | /// Design of OpenMP reductions on the GPU |
2913 | /// |
2914 | /// Consider a typical OpenMP program with one or more reduction |
2915 | /// clauses: |
2916 | /// |
2917 | /// float foo; |
2918 | /// double bar; |
2919 | /// #pragma omp target teams distribute parallel for \ |
2920 | /// reduction(+:foo) reduction(*:bar) |
2921 | /// for (int i = 0; i < N; i++) { |
2922 | /// foo += A[i]; bar *= B[i]; |
2923 | /// } |
2924 | /// |
2925 | /// where 'foo' and 'bar' are reduced across all OpenMP threads in |
2926 | /// all teams. In our OpenMP implementation on the NVPTX device an |
2927 | /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads |
2928 | /// within a team are mapped to CUDA threads within a threadblock. |
2929 | /// Our goal is to efficiently aggregate values across all OpenMP |
2930 | /// threads such that: |
2931 | /// |
2932 | /// - the compiler and runtime are logically concise, and |
2933 | /// - the reduction is performed efficiently in a hierarchical |
2934 | /// manner as follows: within OpenMP threads in the same warp, |
2935 | /// across warps in a threadblock, and finally across teams on |
2936 | /// the NVPTX device. |
2937 | /// |
2938 | /// Introduction to Decoupling |
2939 | /// |
2940 | /// We would like to decouple the compiler and the runtime so that the |
2941 | /// latter is ignorant of the reduction variables (number, data types) |
2942 | /// and the reduction operators. This allows a simpler interface |
2943 | /// and implementation while still attaining good performance. |
2944 | /// |
2945 | /// Pseudocode for the aforementioned OpenMP program generated by the |
2946 | /// compiler is as follows: |
2947 | /// |
2948 | /// 1. Create private copies of reduction variables on each OpenMP |
2949 | /// thread: 'foo_private', 'bar_private' |
2950 | /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned |
2951 | /// to it and writes the result in 'foo_private' and 'bar_private' |
2952 | /// respectively. |
2953 | /// 3. Call the OpenMP runtime on the GPU to reduce within a team |
2954 | /// and store the result on the team master: |
2955 | /// |
2956 | /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., |
2957 | /// reduceData, shuffleReduceFn, interWarpCpyFn) |
2958 | /// |
2959 | /// where: |
2960 | /// struct ReduceData { |
2961 | ///         float *foo;
2962 | /// double *bar; |
2963 | /// } reduceData |
2964 | /// reduceData.foo = &foo_private |
2965 | /// reduceData.bar = &bar_private |
2966 | /// |
2967 | /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two |
2968 | /// auxiliary functions generated by the compiler that operate on |
2969 | /// variables of type 'ReduceData'. They help the runtime perform
2970 | /// algorithmic steps in a data-agnostic manner.
2971 | /// |
2972 | /// 'shuffleReduceFn' is a pointer to a function that reduces data |
2973 | /// of type 'ReduceData' across two OpenMP threads (lanes) in the |
2974 | /// same warp. It takes the following arguments as input: |
2975 | /// |
2976 | /// a. variable of type 'ReduceData' on the calling lane, |
2977 | /// b. its lane_id, |
2978 | /// c. an offset relative to the current lane_id to generate a |
2979 | /// remote_lane_id. The remote lane contains the second |
2980 | /// variable of type 'ReduceData' that is to be reduced. |
2981 | /// d. an algorithm version parameter determining which reduction |
2982 | /// algorithm to use. |
2983 | /// |
2984 | /// 'shuffleReduceFn' retrieves data from the remote lane using |
2985 | /// efficient GPU shuffle intrinsics and reduces, using the |
2986 | /// algorithm specified by the 4th parameter, the two operands |
2987 | /// element-wise. The result is written to the first operand. |
2988 | /// |
2989 | /// Different reduction algorithms are implemented in different |
2990 | /// runtime functions, all calling 'shuffleReduceFn' to perform |
2991 | /// the essential reduction step. Therefore, based on the 4th |
2992 | /// parameter, this function behaves slightly differently to |
2993 | /// cooperate with the runtime to ensure correctness under |
2994 | /// different circumstances. |
2995 | /// |
2996 | /// 'InterWarpCpyFn' is a pointer to a function that transfers |
2997 | /// reduced variables across warps. It tunnels, through CUDA |
2998 | /// shared memory, the thread-private data of type 'ReduceData' |
2999 | /// from lane 0 of each warp to a lane in the first warp. |
3000 | /// 4. Call the OpenMP runtime on the GPU to reduce across teams. |
3001 | /// The last team writes the global reduced value to memory. |
3002 | /// |
3003 | /// ret = __kmpc_nvptx_teams_reduce_nowait(..., |
3004 | /// reduceData, shuffleReduceFn, interWarpCpyFn, |
3005 | /// scratchpadCopyFn, loadAndReduceFn) |
3006 | /// |
3007 | /// 'scratchpadCopyFn' is a helper that stores reduced |
3008 | /// data from the team master to a scratchpad array in |
3009 | /// global memory. |
3010 | /// |
3011 | /// 'loadAndReduceFn' is a helper that loads data from |
3012 | /// the scratchpad array and reduces it with the input |
3013 | /// operand. |
3014 | /// |
3015 | ///     These compiler-generated functions hide address
3016 | /// calculation and alignment information from the runtime. |
3017 | /// 5. if ret == 1: |
3018 | /// The team master of the last team stores the reduced |
3019 | /// result to the globals in memory. |
3020 | /// foo += reduceData.foo; bar *= reduceData.bar |
3021 | /// |
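     | ///
     | /// To make the shape of these helpers concrete, the per-element work of
     | /// 'shuffleReduceFn' for the example above can be sketched as follows.
     | /// This is an illustrative sketch, not the literal generated code;
     | /// 'shuffle_down' stands in for the GPU shuffle intrinsic and the names
     | /// are made up for exposition:
     | ///
     | ///   void shuffleReduceFn(ReduceData *data, short lane_id,
     | ///                        short offset, short algo_version) {
     | ///     // Fetch the remote lane's private copies via shuffles.
     | ///     float remote_foo = shuffle_down(*data->foo, offset, WARPSIZE);
     | ///     double remote_bar = shuffle_down(*data->bar, offset, WARPSIZE);
     | ///     // Combine element-wise with the operators from the clauses
     | ///     // (version 0, full warp reduction; other versions guard the
     | ///     // combine step as described below).
     | ///     *data->foo += remote_foo;
     | ///     *data->bar *= remote_bar;
     | ///   }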
3022 | /// |
3023 | /// Warp Reduction Algorithms |
3024 | /// |
3025 | /// On the warp level, we have three algorithms implemented in the |
3026 | /// OpenMP runtime depending on the number of active lanes: |
3027 | /// |
3028 | /// Full Warp Reduction |
3029 | /// |
3030 | /// The reduce algorithm within a warp where all lanes are active |
3031 | /// is implemented in the runtime as follows: |
3032 | /// |
3033 | /// full_warp_reduce(void *reduce_data, |
3034 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
3035 | /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) |
3036 | /// ShuffleReduceFn(reduce_data, 0, offset, 0); |
3037 | /// } |
3038 | /// |
3039 | /// The algorithm completes in log2(WARPSIZE) steps.
3040 | /// |
3041 | /// 'ShuffleReduceFn' is called here with lane_id set to 0 because it is
3042 | /// not used; we therefore save instructions by not retrieving lane_id
3043 | /// from the corresponding special registers. The 4th parameter, which |
3044 | /// represents the version of the algorithm being used, is set to 0 to |
3045 | /// signify full warp reduction. |
3046 | /// |
3047 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
3048 | /// |
3049 | /// #reduce_elem refers to an element in the local lane's data structure |
3050 | /// #remote_elem is retrieved from a remote lane |
3051 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
3052 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem; |
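     | ///
     | /// For example, with WARPSIZE = 32 the loop above runs five iterations
     | /// with offset = 16, 8, 4, 2, 1; after the last iteration lane 0 holds
     | /// the combination of all 32 lanes' reduce_elem values.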
3053 | /// |
3054 | /// Contiguous Partial Warp Reduction |
3055 | /// |
3056 | /// This reduce algorithm is used within a warp where only the first |
3057 | /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the |
3058 | /// number of OpenMP threads in a parallel region is not a multiple of |
3059 | /// WARPSIZE. The algorithm is implemented in the runtime as follows: |
3060 | /// |
3061 | /// void |
3062 | /// contiguous_partial_reduce(void *reduce_data, |
3063 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn, |
3064 | /// int size, int lane_id) { |
3065 | /// int curr_size; |
3066 | /// int offset; |
3067 | /// curr_size = size; |
3068 | ///     offset = curr_size/2;
3069 | /// while (offset>0) { |
3070 | /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); |
3071 | /// curr_size = (curr_size+1)/2; |
3072 | /// offset = curr_size/2; |
3073 | /// } |
3074 | /// } |
3075 | /// |
3076 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
3077 | /// |
3078 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
3079 | /// if (lane_id < offset) |
3080 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
3081 | /// else |
3082 | /// reduce_elem = remote_elem |
3083 | /// |
3084 | /// This algorithm assumes that the data to be reduced are located in a |
3085 | /// contiguous subset of lanes starting from the first. When there is |
3086 | /// an odd number of active lanes, the data in the last lane is not |
3087 | /// aggregated with any other lane's data but is instead copied over.
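     | ///
     | /// As an illustration, with size = 5 (lanes 0..4 active) the loop runs
     | /// with offset = 2, 1, 1: the first step reduces lanes 2 and 3 into
     | /// lanes 0 and 1 and copies lane 4's data down to lane 2; the two
     | /// offset-1 steps then fold lanes 1 and 2 into lane 0, which ends up
     | /// holding the reduction over all five lanes.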
3088 | /// |
3089 | /// Dispersed Partial Warp Reduction |
3090 | /// |
3091 | /// This algorithm is used within a warp when any discontiguous subset of |
3092 | /// lanes is active. It is used to implement the reduction operation
3093 | /// across lanes in an OpenMP simd region or in a nested parallel region. |
3094 | /// |
3095 | /// void |
3096 | /// dispersed_partial_reduce(void *reduce_data, |
3097 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
3098 | /// int size, remote_id; |
3099 | /// int logical_lane_id = number_of_active_lanes_before_me() * 2; |
3100 | /// do { |
3101 | /// remote_id = next_active_lane_id_right_after_me(); |
3102 | ///     # the above function returns 0 if no active lane
3103 | /// # is present right after the current lane. |
3104 | /// size = number_of_active_lanes_in_this_warp(); |
3105 | /// logical_lane_id /= 2; |
3106 | /// ShuffleReduceFn(reduce_data, logical_lane_id, |
3107 | /// remote_id-1-threadIdx.x, 2); |
3108 | /// } while (logical_lane_id % 2 == 0 && size > 1); |
3109 | /// } |
3110 | /// |
3111 | /// There is no assumption made about the initial state of the reduction. |
3112 | /// Any number of lanes (>=1) could be active at any position. The reduction |
3113 | /// result is returned in the first active lane. |
3114 | /// |
3115 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
3116 | /// |
3117 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
3118 | /// if (lane_id % 2 == 0 && offset > 0) |
3119 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
3120 | /// else |
3121 | /// reduce_elem = remote_elem |
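     | ///
     | /// For instance, if only two lanes of a warp are active, a single step
     | /// suffices: the first active lane shuffles in the second active lane's
     | /// reduce_elem, combines it with its own, and thus ends up holding the
     | /// reduction result, matching the note above that the result is
     | /// returned in the first active lane.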
3122 | /// |
3123 | /// |
3124 | /// Intra-Team Reduction |
3125 | /// |
3126 | /// This function, as implemented in the runtime call |
3127 | /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP |
3128 | /// threads in a team. It first reduces within a warp using the |
3129 | /// aforementioned algorithms. We then proceed to gather all such |
3130 | /// reduced values at the first warp. |
3131 | /// |
3132 | /// The runtime makes use of the function 'InterWarpCpyFn', which copies |
3133 | /// data from each "warp master" (the zeroth lane of each warp, where
3134 | /// warp-reduced data is held) to the zeroth warp. This step reduces (in |
3135 | /// a mathematical sense) the problem of reduction across warp masters in |
3136 | /// a block to the problem of warp reduction. |
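     | ///
     | /// Schematically, the intra-team step can be summarized as the following
     | /// simplified sketch (helper names are illustrative, not the runtime's
     | /// exact code):
     | ///
     | ///   warp_reduce(reduce_data, ShuffleReduceFn);      // within each warp
     | ///   InterWarpCpyFn(reduce_data, num_active_warps);  // gather to warp 0
     | ///   if (warp_id == 0)
     | ///     warp_reduce(reduce_data, ShuffleReduceFn);    // across warp masters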
3137 | /// |
3138 | /// |
3139 | /// Inter-Team Reduction |
3140 | /// |
3141 | /// Once a team has reduced its data to a single value, it is stored in |
3142 | /// a global scratchpad array. Since each team has a distinct slot, this |
3143 | /// can be done without locking. |
3144 | /// |
3145 | /// The last team to write to the scratchpad array proceeds to reduce the |
3146 | /// scratchpad array. One or more workers in the last team use the helper |
3147 | /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., |
3148 | /// the k'th worker reduces every k'th element. |
3149 | /// |
3150 | /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to |
3151 | /// reduce across workers and compute a globally reduced value. |
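     | ///
     | /// Schematically, the inter-team step can be summarized as the following
     | /// simplified sketch (helper names are illustrative, not the runtime's
     | /// exact code):
     | ///
     | ///   scratchpad[team_id] = reduce_data;        // via the copy helpers
     | ///   if (last_team_to_arrive()) {              // atomic counter check
     | ///     for (i = worker_id; i < num_teams; i += num_workers)
     | ///       reduce_data = reduce_data REDUCE_OP scratchpad[i];
     | ///     // then reduce across workers as in the intra-team step
     | ///   }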
3152 | /// |
3153 | void CGOpenMPRuntimeGPU::emitReduction( |
3154 | CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, |
3155 | ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, |
3156 | ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { |
3157 | if (!CGF.HaveInsertPoint()) |
3158 | return; |
3159 | |
3160 | bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); |
3161 | #ifndef NDEBUG
3162 | bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); |
3163 | #endif |
3164 | |
3165 | if (Options.SimpleReduction) { |
3166 |     assert(!TeamsReduction && !ParallelReduction &&
3167 |            "Invalid reduction selection in emitReduction.");
3168 | CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, |
3169 | ReductionOps, Options); |
3170 | return; |
3171 | } |
3172 | |
3173 |   assert((TeamsReduction || ParallelReduction) &&
3174 |          "Invalid reduction selection in emitReduction.");
3175 | |
3176 | // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), |
3177 | // RedList, shuffle_reduce_func, interwarp_copy_func); |
3178 | // or |
3179 | // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); |
3180 | llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
3181 | llvm::Value *ThreadId = getThreadID(CGF, Loc); |
3182 | |
3183 | llvm::Value *Res; |
3184 | ASTContext &C = CGM.getContext(); |
3185 | // 1. Build a list of reduction variables. |
3186 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
3187 | auto Size = RHSExprs.size(); |
3188 | for (const Expr *E : Privates) { |
3189 | if (E->getType()->isVariablyModifiedType()) |
3190 | // Reserve place for array size. |
3191 | ++Size; |
3192 | } |
3193 | llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); |
3194 | QualType ReductionArrayTy = |
3195 | C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, |
3196 | /*IndexTypeQuals=*/0); |
3197 | Address ReductionList = |
3198 | CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); |
3199 | auto IPriv = Privates.begin(); |
3200 | unsigned Idx = 0; |
3201 | for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { |
3202 | Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
3203 | CGF.Builder.CreateStore( |
3204 | CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3205 | CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), |
3206 | Elem); |
3207 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
3208 | // Store array size. |
3209 | ++Idx; |
3210 | Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); |
3211 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
3212 | CGF.getVLASize( |
3213 | CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) |
3214 | .NumElts, |
3215 | CGF.SizeTy, /*isSigned=*/false); |
3216 | CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), |
3217 | Elem); |
3218 | } |
3219 | } |
3220 | |
3221 | llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3222 | ReductionList.getPointer(), CGF.VoidPtrTy); |
3223 | llvm::Function *ReductionFn = emitReductionFunction( |
3224 | Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, |
3225 | LHSExprs, RHSExprs, ReductionOps); |
3226 | llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); |
3227 | llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( |
3228 | CGM, Privates, ReductionArrayTy, ReductionFn, Loc); |
3229 | llvm::Value *InterWarpCopyFn = |
3230 | emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); |
3231 | |
3232 | if (ParallelReduction) { |
3233 | llvm::Value *Args[] = {RTLoc, |
3234 | ThreadId, |
3235 | CGF.Builder.getInt32(RHSExprs.size()), |
3236 | ReductionArrayTySize, |
3237 | RL, |
3238 | ShuffleAndReduceFn, |
3239 | InterWarpCopyFn}; |
3240 | |
3241 | Res = CGF.EmitRuntimeCall( |
3242 | OMPBuilder.getOrCreateRuntimeFunction( |
3243 | CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2), |
3244 | Args); |
3245 | } else { |
3246 |     assert(TeamsReduction && "expected teams reduction.");
3247 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; |
3248 | llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); |
3249 | int Cnt = 0; |
3250 | for (const Expr *DRE : Privates) { |
3251 | PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); |
3252 | ++Cnt; |
3253 | } |
3254 | const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( |
3255 | CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap, |
3256 | C.getLangOpts().OpenMPCUDAReductionBufNum); |
3257 | TeamsReductions.push_back(TeamReductionRec); |
3258 | if (!KernelTeamsReductionPtr) { |
3259 | KernelTeamsReductionPtr = new llvm::GlobalVariable( |
3260 | CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, |
3261 | llvm::GlobalValue::InternalLinkage, nullptr, |
3262 | "_openmp_teams_reductions_buffer_$_$ptr"); |
3263 | } |
3264 | llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( |
3265 | Address(KernelTeamsReductionPtr, CGM.getPointerAlign()), |
3266 | /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); |
3267 | llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( |
3268 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); |
3269 | llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( |
3270 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, |
3271 | ReductionFn); |
3272 | llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( |
3273 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); |
3274 | llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( |
3275 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, |
3276 | ReductionFn); |
3277 | |
3278 | llvm::Value *Args[] = { |
3279 | RTLoc, |
3280 | ThreadId, |
3281 | GlobalBufferPtr, |
3282 | CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), |
3283 | RL, |
3284 | ShuffleAndReduceFn, |
3285 | InterWarpCopyFn, |
3286 | GlobalToBufferCpyFn, |
3287 | GlobalToBufferRedFn, |
3288 | BufferToGlobalCpyFn, |
3289 | BufferToGlobalRedFn}; |
3290 | |
3291 | Res = CGF.EmitRuntimeCall( |
3292 | OMPBuilder.getOrCreateRuntimeFunction( |
3293 | CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2), |
3294 | Args); |
3295 | } |
3296 | |
3297 | // 5. Build if (res == 1) |
3298 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); |
3299 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); |
3300 | llvm::Value *Cond = CGF.Builder.CreateICmpEQ( |
3301 | Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); |
3302 | CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); |
3303 | |
3304 | // 6. Build then branch: where we have reduced values in the master |
3305 | // thread in each team. |
3306 | // __kmpc_end_reduce{_nowait}(<gtid>); |
3307 | // break; |
3308 | CGF.EmitBlock(ThenBB); |
3309 | |
3310 | // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); |
3311 | auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, |
3312 | this](CodeGenFunction &CGF, PrePostActionTy &Action) { |
3313 | auto IPriv = Privates.begin(); |
3314 | auto ILHS = LHSExprs.begin(); |
3315 | auto IRHS = RHSExprs.begin(); |
3316 | for (const Expr *E : ReductionOps) { |
3317 | emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), |
3318 | cast<DeclRefExpr>(*IRHS)); |
3319 | ++IPriv; |
3320 | ++ILHS; |
3321 | ++IRHS; |
3322 | } |
3323 | }; |
3324 | llvm::Value *EndArgs[] = {ThreadId}; |
3325 | RegionCodeGenTy RCG(CodeGen); |
3326 | NVPTXActionTy Action( |
3327 | nullptr, llvm::None, |
3328 | OMPBuilder.getOrCreateRuntimeFunction( |
3329 | CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait), |
3330 | EndArgs); |
3331 | RCG.setAction(Action); |
3332 | RCG(CGF); |
3333 | // There is no need to emit line number for unconditional branch. |
3334 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
3335 | CGF.EmitBlock(ExitBB, /*IsFinished=*/true); |
3336 | } |
3337 | |
3338 | const VarDecl * |
3339 | CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, |
3340 | const VarDecl *NativeParam) const { |
3341 | if (!NativeParam->getType()->isReferenceType()) |
3342 | return NativeParam; |
3343 | QualType ArgType = NativeParam->getType(); |
3344 | QualifierCollector QC; |
3345 | const Type *NonQualTy = QC.strip(ArgType); |
3346 | QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); |
3347 | if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { |
3348 | if (Attr->getCaptureKind() == OMPC_map) { |
3349 | PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, |
3350 | LangAS::opencl_global); |
3351 | } |
3352 | } |
3353 | ArgType = CGM.getContext().getPointerType(PointeeTy); |
3354 | QC.addRestrict(); |
3355 | enum { NVPTX_local_addr = 5 }; |
3356 | QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); |
3357 | ArgType = QC.apply(CGM.getContext(), ArgType); |
3358 | if (isa<ImplicitParamDecl>(NativeParam)) |
3359 | return ImplicitParamDecl::Create( |
3360 | CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), |
3361 | NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); |
3362 | return ParmVarDecl::Create( |
3363 | CGM.getContext(), |
3364 | const_cast<DeclContext *>(NativeParam->getDeclContext()), |
3365 | NativeParam->getBeginLoc(), NativeParam->getLocation(), |
3366 | NativeParam->getIdentifier(), ArgType, |
3367 | /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); |
3368 | } |
3369 | |
3370 | Address |
3371 | CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, |
3372 | const VarDecl *NativeParam, |
3373 | const VarDecl *TargetParam) const { |
3374 |   assert(NativeParam != TargetParam &&
3375 |          NativeParam->getType()->isReferenceType() &&
3376 |          "Native arg must not be the same as target arg.");
3377 | Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); |
3378 | QualType NativeParamType = NativeParam->getType(); |
3379 | QualifierCollector QC; |
3380 | const Type *NonQualTy = QC.strip(NativeParamType); |
3381 | QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); |
3382 | unsigned NativePointeeAddrSpace = |
3383 | CGF.getContext().getTargetAddressSpace(NativePointeeTy); |
3384 | QualType TargetTy = TargetParam->getType(); |
3385 | llvm::Value *TargetAddr = CGF.EmitLoadOfScalar( |
3386 | LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation()); |
3387 | // First cast to generic. |
3388 | TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3389 | TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( |
3390 | /*AddrSpace=*/0)); |
3391 | // Cast from generic to native address space. |
3392 | TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3393 | TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( |
3394 | NativePointeeAddrSpace)); |
3395 | Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); |
3396 | CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, |
3397 | NativeParamType); |
3398 | return NativeParamAddr; |
3399 | } |
3400 | |
3401 | void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( |
3402 | CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, |
3403 | ArrayRef<llvm::Value *> Args) const { |
3404 | SmallVector<llvm::Value *, 4> TargetArgs; |
3405 | TargetArgs.reserve(Args.size()); |
3406 | auto *FnType = OutlinedFn.getFunctionType(); |
3407 | for (unsigned I = 0, E = Args.size(); I < E; ++I) { |
3408 | if (FnType->isVarArg() && FnType->getNumParams() <= I) { |
3409 | TargetArgs.append(std::next(Args.begin(), I), Args.end()); |
3410 | break; |
3411 | } |
3412 | llvm::Type *TargetType = FnType->getParamType(I); |
3413 | llvm::Value *NativeArg = Args[I]; |
3414 | if (!TargetType->isPointerTy()) { |
3415 | TargetArgs.emplace_back(NativeArg); |
3416 | continue; |
3417 | } |
3418 | llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3419 | NativeArg, |
3420 | NativeArg->getType()->getPointerElementType()->getPointerTo()); |
3421 | TargetArgs.emplace_back( |
3422 | CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); |
3423 | } |
3424 | CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); |
3425 | } |
3426 | |
3427 | /// Emit function which wraps the outlined parallel region
3428 | /// and controls the arguments which are passed to this function. |
3429 | /// The wrapper ensures that the outlined function is called |
3430 | /// with the correct arguments when data is shared. |
3431 | llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( |
3432 | llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { |
3433 | ASTContext &Ctx = CGM.getContext(); |
3434 | const auto &CS = *D.getCapturedStmt(OMPD_parallel); |
3435 | |
3436 | // Create a function that takes as argument the source thread. |
3437 | FunctionArgList WrapperArgs; |
3438 | QualType Int16QTy = |
3439 | Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); |
3440 | QualType Int32QTy = |
3441 | Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); |
3442 | ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
3443 | /*Id=*/nullptr, Int16QTy, |
3444 | ImplicitParamDecl::Other); |
3445 | ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
3446 | /*Id=*/nullptr, Int32QTy, |
3447 | ImplicitParamDecl::Other); |
3448 | WrapperArgs.emplace_back(&ParallelLevelArg); |
3449 | WrapperArgs.emplace_back(&WrapperArg); |
3450 | |
3451 | const CGFunctionInfo &CGFI = |
3452 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); |
3453 | |
3454 | auto *Fn = llvm::Function::Create( |
3455 | CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, |
3456 | Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); |
3457 | |
3458 | // Ensure we do not inline the function. This is trivially true for the ones |
3459 |   // passed to __kmpc_fork_call but the ones called in serialized regions
3460 |   // could be inlined. This is not perfect but it is closer to the invariant
3461 | // we want, namely, every data environment starts with a new function. |
3462 | // TODO: We should pass the if condition to the runtime function and do the |
3463 | // handling there. Much cleaner code. |
3464 | Fn->addFnAttr(llvm::Attribute::NoInline); |
3465 | |
3466 | CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); |
3467 | Fn->setLinkage(llvm::GlobalValue::InternalLinkage); |
3468 | Fn->setDoesNotRecurse(); |
3469 | |
3470 | CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
3471 | CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, |
3472 | D.getBeginLoc(), D.getBeginLoc()); |
3473 | |
3474 | const auto *RD = CS.getCapturedRecordDecl(); |
3475 | auto CurField = RD->field_begin(); |
3476 | |
3477 | Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, |
3478 | /*Name=*/".zero.addr"); |
3479 | CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
3480 | // Get the array of arguments. |
3481 | SmallVector<llvm::Value *, 8> Args; |
3482 | |
3483 | Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer()); |
3484 | Args.emplace_back(ZeroAddr.getPointer()); |
3485 | |
3486 | CGBuilderTy &Bld = CGF.Builder; |
3487 | auto CI = CS.capture_begin(); |
3488 | |
3489 | // Use global memory for data sharing. |
3490 | // Handle passing of global args to workers. |
3491 | Address GlobalArgs = |
3492 | CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); |
3493 | llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); |
3494 | llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; |
3495 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
3496 | CGM.getModule(), OMPRTL___kmpc_get_shared_variables), |
3497 | DataSharingArgs); |
3498 | |
3499 | // Retrieve the shared variables from the list of references returned |
3500 | // by the runtime. Pass the variables to the outlined function. |
3501 | Address SharedArgListAddress = Address::invalid(); |
3502 | if (CS.capture_size() > 0 || |
3503 | isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
3504 | SharedArgListAddress = CGF.EmitLoadOfPointer( |
3505 | GlobalArgs, CGF.getContext() |
3506 | .getPointerType(CGF.getContext().getPointerType( |
3507 | CGF.getContext().VoidPtrTy)) |
3508 | .castAs<PointerType>()); |
3509 | } |
3510 | unsigned Idx = 0; |
3511 | if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
3512 | Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); |
3513 | Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3514 | Src, CGF.SizeTy->getPointerTo()); |
3515 | llvm::Value *LB = CGF.EmitLoadOfScalar( |
3516 | TypedAddress, |
3517 | /*Volatile=*/false, |
3518 | CGF.getContext().getPointerType(CGF.getContext().getSizeType()), |
3519 | cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); |
3520 | Args.emplace_back(LB); |
3521 | ++Idx; |
3522 | Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); |
3523 | TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3524 | Src, CGF.SizeTy->getPointerTo()); |
3525 | llvm::Value *UB = CGF.EmitLoadOfScalar( |
3526 | TypedAddress, |
3527 | /*Volatile=*/false, |
3528 | CGF.getContext().getPointerType(CGF.getContext().getSizeType()), |
3529 | cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); |
3530 | Args.emplace_back(UB); |
3531 | ++Idx; |
3532 | } |
3533 | if (CS.capture_size() > 0) { |
3534 | ASTContext &CGFContext = CGF.getContext(); |
3535 | for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { |
3536 | QualType ElemTy = CurField->getType(); |
3537 | Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); |
3538 | Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3539 | Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); |
3540 | llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, |
3541 | /*Volatile=*/false, |
3542 | CGFContext.getPointerType(ElemTy), |
3543 | CI->getLocation()); |
3544 | if (CI->capturesVariableByCopy() && |
3545 | !CI->getCapturedVar()->getType()->isAnyPointerType()) { |
3546 | Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), |
3547 | CI->getLocation()); |
3548 | } |
3549 | Args.emplace_back(Arg); |
3550 | } |
3551 | } |
3552 | |
3553 | emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); |
3554 | CGF.FinishFunction(); |
3555 | return Fn; |
3556 | } |
3557 | |
3558 | void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, |
3559 | const Decl *D) { |
3560 | if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) |
3561 | return; |
3562 | |
3563 |   assert(D && "Expected function or captured|block decl.");
3564 |   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3565 |          "Function is registered already.");
3566 |   assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3567 |          "Team is set but not processed.");
3568 | const Stmt *Body = nullptr; |
3569 | bool NeedToDelayGlobalization = false; |
3570 | if (const auto *FD = dyn_cast<FunctionDecl>(D)) { |
3571 | Body = FD->getBody(); |
3572 | } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { |
3573 | Body = BD->getBody(); |
3574 | } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { |
3575 | Body = CD->getBody(); |
3576 | NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; |
3577 | if (NeedToDelayGlobalization && |
3578 | getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
3579 | return; |
3580 | } |
3581 | if (!Body) |
3582 | return; |
3583 | CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); |
3584 | VarChecker.Visit(Body); |
3585 | const RecordDecl *GlobalizedVarsRecord = |
3586 | VarChecker.getGlobalizedRecord(IsInTTDRegion); |
3587 | TeamAndReductions.first = nullptr; |
3588 | TeamAndReductions.second.clear(); |
3589 | ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = |
3590 | VarChecker.getEscapedVariableLengthDecls(); |
3591 | if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) |
3592 | return; |
3593 | auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; |
3594 | I->getSecond().MappedParams = |
3595 | std::make_unique<CodeGenFunction::OMPMapVars>(); |
3596 | I->getSecond().EscapedParameters.insert( |
3597 | VarChecker.getEscapedParameters().begin(), |
3598 | VarChecker.getEscapedParameters().end()); |
3599 | I->getSecond().EscapedVariableLengthDecls.append( |
3600 | EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); |
3601 | DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
3602 | for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { |
3603 |     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3604 | Data.insert(std::make_pair(VD, MappedVarData())); |
3605 | } |
3606 | if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { |
3607 | CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); |
3608 | VarChecker.Visit(Body); |
3609 | I->getSecond().SecondaryLocalVarData.emplace(); |
3610 | DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); |
3611 | for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { |
3612 |       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3613 | Data.insert(std::make_pair(VD, MappedVarData())); |
3614 | } |
3615 | } |
3616 | if (!NeedToDelayGlobalization) { |
3617 | emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); |
3618 | struct GlobalizationScope final : EHScopeStack::Cleanup { |
3619 | GlobalizationScope() = default; |
3620 | |
3621 | void Emit(CodeGenFunction &CGF, Flags flags) override { |
3622 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
3623 | .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true); |
3624 | } |
3625 | }; |
3626 | CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); |
3627 | } |
3628 | } |
3629 | |
3630 | Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, |
3631 | const VarDecl *VD) { |
3632 | if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { |
3633 | const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
3634 | auto AS = LangAS::Default; |
3635 | switch (A->getAllocatorType()) { |
3636 | // Use the default allocator here as by default local vars are |
3637 | // threadlocal. |
3638 | case OMPAllocateDeclAttr::OMPNullMemAlloc: |
3639 | case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
3640 | case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
3641 | case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
3642 | case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
3643 | // Follow the user decision - use default allocation. |
3644 | return Address::invalid(); |
3645 | case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
3646 |       // TODO: implement support for user-defined allocators.
3647 | return Address::invalid(); |
3648 | case OMPAllocateDeclAttr::OMPConstMemAlloc: |
3649 | AS = LangAS::cuda_constant; |
3650 | break; |
3651 | case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
3652 | AS = LangAS::cuda_shared; |
3653 | break; |
3654 | case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
3655 | case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
3656 | break; |
3657 | } |
3658 | llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); |
3659 | auto *GV = new llvm::GlobalVariable( |
3660 | CGM.getModule(), VarTy, /*isConstant=*/false, |
3661 | llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy), |
3662 | VD->getName(), |
3663 | /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
3664 | CGM.getContext().getTargetAddressSpace(AS)); |
3665 | CharUnits Align = CGM.getContext().getDeclAlign(VD); |
3666 | GV->setAlignment(Align.getAsAlign()); |
3667 | return Address( |
3668 | CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3669 | GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( |
3670 | VD->getType().getAddressSpace()))), |
3671 | Align); |
3672 | } |
3673 | |
3674 | if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) |
3675 | return Address::invalid(); |
3676 | |
3677 | VD = VD->getCanonicalDecl(); |
3678 | auto I = FunctionGlobalizedDecls.find(CGF.CurFn); |
3679 | if (I == FunctionGlobalizedDecls.end()) |
3680 | return Address::invalid(); |
3681 | auto VDI = I->getSecond().LocalVarData.find(VD); |
3682 | if (VDI != I->getSecond().LocalVarData.end()) |
3683 | return VDI->second.PrivateAddr; |
3684 | if (VD->hasAttrs()) { |
3685 | for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()), |
3686 | E(VD->attr_end()); |
3687 | IT != E; ++IT) { |
3688 | auto VDI = I->getSecond().LocalVarData.find( |
3689 | cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl()) |
3690 | ->getCanonicalDecl()); |
3691 | if (VDI != I->getSecond().LocalVarData.end()) |
3692 | return VDI->second.PrivateAddr; |
3693 | } |
3694 | } |
3695 | |
3696 | return Address::invalid(); |
3697 | } |
3698 | |
3699 | void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { |
3700 | FunctionGlobalizedDecls.erase(CGF.CurFn); |
3701 | CGOpenMPRuntime::functionFinished(CGF); |
3702 | } |
3703 | |
3704 | void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( |
3705 | CodeGenFunction &CGF, const OMPLoopDirective &S, |
3706 | OpenMPDistScheduleClauseKind &ScheduleKind, |
3707 | llvm::Value *&Chunk) const { |
3708 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
3709 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
3710 | ScheduleKind = OMPC_DIST_SCHEDULE_static; |
3711 | Chunk = CGF.EmitScalarConversion( |
3712 | RT.getGPUNumThreads(CGF), |
3713 | CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), |
3714 | S.getIterationVariable()->getType(), S.getBeginLoc()); |
3715 | return; |
3716 | } |
3717 | CGOpenMPRuntime::getDefaultDistScheduleAndChunk( |
3718 | CGF, S, ScheduleKind, Chunk); |
3719 | } |
3720 | |
3721 | void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( |
3722 | CodeGenFunction &CGF, const OMPLoopDirective &S, |
3723 | OpenMPScheduleClauseKind &ScheduleKind, |
3724 | const Expr *&ChunkExpr) const { |
3725 | ScheduleKind = OMPC_SCHEDULE_static; |
3726 | // Chunk size is 1 in this case. |
3727 | llvm::APInt ChunkSize(32, 1); |
3728 | ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, |
3729 | CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), |
3730 | SourceLocation()); |
3731 | } |
3732 | |
3733 | void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( |
3734 | CodeGenFunction &CGF, const OMPExecutableDirective &D) const { |
3735 |   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3736 |          " Expected target-based directive.");
3737 | const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); |
3738 | for (const CapturedStmt::Capture &C : CS->captures()) { |
3739 | // Capture variables captured by reference in lambdas for target-based |
3740 | // directives. |
3741 | if (!C.capturesVariable()) |
3742 | continue; |
3743 | const VarDecl *VD = C.getCapturedVar(); |
3744 | const auto *RD = VD->getType() |
3745 | .getCanonicalType() |
3746 | .getNonReferenceType() |
3747 | ->getAsCXXRecordDecl(); |
3748 | if (!RD || !RD->isLambda()) |
3749 | continue; |
3750 | Address VDAddr = CGF.GetAddrOfLocalVar(VD); |
3751 | LValue VDLVal; |
3752 | if (VD->getType().getCanonicalType()->isReferenceType()) |
3753 | VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); |
3754 | else |
3755 | VDLVal = CGF.MakeAddrLValue( |
3756 | VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); |
3757 | llvm::DenseMap<const VarDecl *, FieldDecl *> Captures; |
3758 | FieldDecl *ThisCapture = nullptr; |
3759 | RD->getCaptureFields(Captures, ThisCapture); |
3760 | if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { |
3761 | LValue ThisLVal = |
3762 | CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); |
3763 | llvm::Value *CXXThis = CGF.LoadCXXThis(); |
3764 | CGF.EmitStoreOfScalar(CXXThis, ThisLVal); |
3765 | } |
3766 | for (const LambdaCapture &LC : RD->captures()) { |
3767 | if (LC.getCaptureKind() != LCK_ByRef) |
3768 | continue; |
3769 | const VarDecl *VD = LC.getCapturedVar(); |
3770 | if (!CS->capturesVariable(VD)) |
3771 | continue; |
3772 | auto It = Captures.find(VD); |
3773 |       assert(It != Captures.end() && "Found lambda capture without field.");
3774 | LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); |
3775 | Address VDAddr = CGF.GetAddrOfLocalVar(VD); |
3776 | if (VD->getType().getCanonicalType()->isReferenceType()) |
3777 | VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, |
3778 | VD->getType().getCanonicalType()) |
3779 | .getAddress(CGF); |
3780 | CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal); |
3781 | } |
3782 | } |
3783 | } |
3784 | |
3785 | bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, |
3786 | LangAS &AS) { |
3787 | if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) |
3788 | return false; |
3789 | const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
3790 | switch(A->getAllocatorType()) { |
3791 | case OMPAllocateDeclAttr::OMPNullMemAlloc: |
3792 | case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
3793 |     // Not supported, fall back to the default mem space.
3794 | case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
3795 | case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
3796 | case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
3797 | case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
3798 | case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
3799 | AS = LangAS::Default; |
3800 | return true; |
3801 | case OMPAllocateDeclAttr::OMPConstMemAlloc: |
3802 | AS = LangAS::cuda_constant; |
3803 | return true; |
3804 | case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
3805 | AS = LangAS::cuda_shared; |
3806 | return true; |
3807 | case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
3808 |     llvm_unreachable("Expected predefined allocator for the variables with the "
3809 |                      "static storage.");
3810 | } |
3811 | return false; |
3812 | } |
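     |
     | // Editorial sketch (illustrative, not from the original source): how the
     | // mapping above surfaces for a user declaration such as
     | //
     | //   int Scratch;
     | //   #pragma omp allocate(Scratch) allocator(omp_pteam_mem_alloc)
     | //
     | // Here the attribute's allocator is OMPPTeamMemAlloc, so AS becomes
     | // LangAS::cuda_shared and the global lands in __shared__ memory; likewise
     | // omp_const_mem_alloc yields LangAS::cuda_constant (__constant__), and the
     | // remaining predefined allocators fall back to the default address space.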
3813 | |
3814 | // Get the current CudaArch, ignoring any unknown values.
3815 | static CudaArch getCudaArch(CodeGenModule &CGM) { |
3816 | if (!CGM.getTarget().hasFeature("ptx")) |
3817 | return CudaArch::UNKNOWN; |
3818 | for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { |
3819 | if (Feature.getValue()) { |
3820 | CudaArch Arch = StringToCudaArch(Feature.getKey()); |
3821 | if (Arch != CudaArch::UNKNOWN) |
3822 | return Arch; |
3823 | } |
3824 | } |
3825 | return CudaArch::UNKNOWN; |
3826 | } |
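     |
     | // Editorial sketch (assumption about typical inputs): for an NVPTX offload
     | // target built for sm_70, FeatureMap usually holds entries such as
     | // {"sm_70", true} and {"ptx70", true}. StringToCudaArch("sm_70") yields
     | // CudaArch::SM_70, while "ptx70" names no architecture and is skipped, so
     | // the loop returns SM_70.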
3827 | |
3828 | /// Check whether the target architecture supports unified addressing, a
3829 | /// prerequisite for the OpenMP requires clause "unified_shared_memory".
3830 | void CGOpenMPRuntimeGPU::processRequiresDirective( |
3831 | const OMPRequiresDecl *D) { |
3832 | for (const OMPClause *Clause : D->clauselists()) { |
3833 | if (Clause->getClauseKind() == OMPC_unified_shared_memory) { |
3834 | CudaArch Arch = getCudaArch(CGM); |
3835 | switch (Arch) { |
3836 | case CudaArch::SM_20: |
3837 | case CudaArch::SM_21: |
3838 | case CudaArch::SM_30: |
3839 | case CudaArch::SM_32: |
3840 | case CudaArch::SM_35: |
3841 | case CudaArch::SM_37: |
3842 | case CudaArch::SM_50: |
3843 | case CudaArch::SM_52: |
3844 | case CudaArch::SM_53: { |
3845 | SmallString<256> Buffer; |
3846 | llvm::raw_svector_ostream Out(Buffer); |
3847 | Out << "Target architecture " << CudaArchToString(Arch) |
3848 | << " does not support unified addressing"; |
3849 | CGM.Error(Clause->getBeginLoc(), Out.str()); |
3850 | return; |
3851 | } |
3852 | case CudaArch::SM_60: |
3853 | case CudaArch::SM_61: |
3854 | case CudaArch::SM_62: |
3855 | case CudaArch::SM_70: |
3856 | case CudaArch::SM_72: |
3857 | case CudaArch::SM_75: |
3858 | case CudaArch::SM_80: |
3859 | case CudaArch::SM_86: |
3860 | case CudaArch::GFX600: |
3861 | case CudaArch::GFX601: |
3862 | case CudaArch::GFX602: |
3863 | case CudaArch::GFX700: |
3864 | case CudaArch::GFX701: |
3865 | case CudaArch::GFX702: |
3866 | case CudaArch::GFX703: |
3867 | case CudaArch::GFX704: |
3868 | case CudaArch::GFX705: |
3869 | case CudaArch::GFX801: |
3870 | case CudaArch::GFX802: |
3871 | case CudaArch::GFX803: |
3872 | case CudaArch::GFX805: |
3873 | case CudaArch::GFX810: |
3874 | case CudaArch::GFX900: |
3875 | case CudaArch::GFX902: |
3876 | case CudaArch::GFX904: |
3877 | case CudaArch::GFX906: |
3878 | case CudaArch::GFX908: |
3879 | case CudaArch::GFX909: |
3880 | case CudaArch::GFX90a: |
3881 | case CudaArch::GFX90c: |
3882 | case CudaArch::GFX1010: |
3883 | case CudaArch::GFX1011: |
3884 | case CudaArch::GFX1012: |
3885 | case CudaArch::GFX1013: |
3886 | case CudaArch::GFX1030: |
3887 | case CudaArch::GFX1031: |
3888 | case CudaArch::GFX1032: |
3889 | case CudaArch::GFX1033: |
3890 | case CudaArch::GFX1034: |
3891 | case CudaArch::GFX1035: |
3892 | case CudaArch::UNUSED: |
3893 | case CudaArch::UNKNOWN: |
3894 | break; |
3895 | case CudaArch::LAST: |
3896 | llvm_unreachable("Unexpected Cuda arch.")__builtin_unreachable(); |
3897 | } |
3898 | } |
3899 | } |
3900 | CGOpenMPRuntime::processRequiresDirective(D); |
3901 | } |
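     |
     | // Editorial sketch (illustrative): the pattern the check above diagnoses.
     | // Compiling
     | //
     | //   #pragma omp requires unified_shared_memory
     | //
     | // while offloading to sm_35 produces "Target architecture sm_35 does not
     | // support unified addressing"; sm_60 and newer, and the listed AMD GPUs,
     | // compile without a diagnostic.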
3902 | |
3903 | void CGOpenMPRuntimeGPU::clear() { |
3904 | |
3905 | if (!TeamsReductions.empty()) { |
3906 | ASTContext &C = CGM.getContext(); |
3907 | RecordDecl *StaticRD = C.buildImplicitRecord( |
3908 | "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); |
3909 | StaticRD->startDefinition(); |
3910 | for (const RecordDecl *TeamReductionRec : TeamsReductions) { |
3911 | QualType RecTy = C.getRecordType(TeamReductionRec); |
3912 | auto *Field = FieldDecl::Create( |
3913 | C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, |
3914 | C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), |
3915 | /*BW=*/nullptr, /*Mutable=*/false, |
3916 | /*InitStyle=*/ICIS_NoInit); |
3917 | Field->setAccess(AS_public); |
3918 | StaticRD->addDecl(Field); |
3919 | } |
3920 | StaticRD->completeDefinition(); |
3921 | QualType StaticTy = C.getRecordType(StaticRD); |
3922 | llvm::Type *LLVMReductionsBufferTy = |
3923 | CGM.getTypes().ConvertTypeForMem(StaticTy); |
3924 | // FIXME: nvlink does not handle weak linkage correctly (objects with
3925 | // different sizes are reported as erroneous).
3926 | // Restore CommonLinkage as soon as nvlink is fixed.
3927 | auto *GV = new llvm::GlobalVariable( |
3928 | CGM.getModule(), LLVMReductionsBufferTy, |
3929 | /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, |
3930 | llvm::Constant::getNullValue(LLVMReductionsBufferTy), |
3931 | "_openmp_teams_reductions_buffer_$_"); |
3932 | KernelTeamsReductionPtr->setInitializer( |
3933 | llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, |
3934 | CGM.VoidPtrTy)); |
3935 | } |
3936 | CGOpenMPRuntime::clear(); |
3937 | } |
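     |
     | // Editorial sketch (illustrative assumption): if TeamsReductions recorded
     | // two reduction record types, say { double, double } (16 bytes) and
     | // { int } (4 bytes), the implicit union built above carries both as
     | // fields, so the single _openmp_teams_reductions_buffer_$_ global is
     | // sized for the largest member and can back either reduction;
     | // InternalLinkage sidesteps the nvlink weak-linkage issue noted above.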