File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
Warning: line 2696, column 48: Called C++ object pointer is null
1 | //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // OpenMP specific optimizations: | ||||||
10 | // | ||||||
11 | // - Deduplication of runtime calls, e.g., omp_get_thread_num. | ||||||
12 | // - Replacing globalized device memory with stack memory. | ||||||
13 | // - Replacing globalized device memory with shared memory. | ||||||
14 | // - Parallel region merging. | ||||||
15 | // - Transforming generic-mode device kernels to SPMD mode. | ||||||
16 | // - Specializing the state machine for generic-mode device kernels. | ||||||
17 | // | ||||||
18 | //===----------------------------------------------------------------------===// | ||||||
19 | |||||||
20 | #include "llvm/Transforms/IPO/OpenMPOpt.h" | ||||||
21 | |||||||
22 | #include "llvm/ADT/EnumeratedArray.h" | ||||||
23 | #include "llvm/ADT/PostOrderIterator.h" | ||||||
24 | #include "llvm/ADT/Statistic.h" | ||||||
25 | #include "llvm/Analysis/CallGraph.h" | ||||||
26 | #include "llvm/Analysis/CallGraphSCCPass.h" | ||||||
27 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" | ||||||
28 | #include "llvm/Analysis/ValueTracking.h" | ||||||
29 | #include "llvm/Frontend/OpenMP/OMPConstants.h" | ||||||
30 | #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" | ||||||
31 | #include "llvm/IR/Assumptions.h" | ||||||
32 | #include "llvm/IR/DiagnosticInfo.h" | ||||||
33 | #include "llvm/IR/GlobalValue.h" | ||||||
34 | #include "llvm/IR/Instruction.h" | ||||||
35 | #include "llvm/IR/IntrinsicInst.h" | ||||||
36 | #include "llvm/InitializePasses.h" | ||||||
37 | #include "llvm/Support/CommandLine.h" | ||||||
38 | #include "llvm/Transforms/IPO.h" | ||||||
39 | #include "llvm/Transforms/IPO/Attributor.h" | ||||||
40 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" | ||||||
41 | #include "llvm/Transforms/Utils/CallGraphUpdater.h" | ||||||
42 | #include "llvm/Transforms/Utils/CodeExtractor.h" | ||||||
43 | |||||||
44 | using namespace llvm; | ||||||
45 | using namespace omp; | ||||||
46 | |||||||
47 | #define DEBUG_TYPE "openmp-opt" | ||||||
48 | |||||||
49 | static cl::opt<bool> DisableOpenMPOptimizations( | ||||||
50 | "openmp-opt-disable", cl::ZeroOrMore, | ||||||
51 | cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, | ||||||
52 | cl::init(false)); | ||||||
53 | |||||||
54 | static cl::opt<bool> EnableParallelRegionMerging( | ||||||
55 | "openmp-opt-enable-merging", cl::ZeroOrMore, | ||||||
56 | cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, | ||||||
57 | cl::init(false)); | ||||||
58 | |||||||
59 | static cl::opt<bool> | ||||||
60 | DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore, | ||||||
61 | cl::desc("Disable function internalization."), | ||||||
62 | cl::Hidden, cl::init(false)); | ||||||
63 | |||||||
64 | static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false), | ||||||
65 | cl::Hidden); | ||||||
66 | static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels", | ||||||
67 | cl::init(false), cl::Hidden); | ||||||
68 | |||||||
69 | static cl::opt<bool> HideMemoryTransferLatency( | ||||||
70 | "openmp-hide-memory-transfer-latency", | ||||||
71 | cl::desc("[WIP] Tries to hide the latency of host to device memory" | ||||||
72 | " transfers"), | ||||||
73 | cl::Hidden, cl::init(false)); | ||||||
74 | |||||||
75 | STATISTIC(NumOpenMPRuntimeCallsDeduplicated, | ||||||
76 | "Number of OpenMP runtime calls deduplicated"); | ||||||
77 | STATISTIC(NumOpenMPParallelRegionsDeleted, | ||||||
78 | "Number of OpenMP parallel regions deleted"); | ||||||
79 | STATISTIC(NumOpenMPRuntimeFunctionsIdentified, | ||||||
80 | "Number of OpenMP runtime functions identified"); | ||||||
81 | STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified, | ||||||
82 | "Number of OpenMP runtime function uses identified"); | ||||||
83 | STATISTIC(NumOpenMPTargetRegionKernels, | ||||||
84 | "Number of OpenMP target region entry points (=kernels) identified"); | ||||||
85 | STATISTIC(NumOpenMPTargetRegionKernelsSPMD, | ||||||
86 | "Number of OpenMP target region entry points (=kernels) executed in " | ||||||
87 | "SPMD-mode instead of generic-mode"); | ||||||
88 | STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine, | ||||||
89 | "Number of OpenMP target region entry points (=kernels) executed in " | ||||||
90 | "generic-mode without a state machines"); | ||||||
91 | STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback, | ||||||
92 | "Number of OpenMP target region entry points (=kernels) executed in " | ||||||
93 | "generic-mode with customized state machines with fallback"); | ||||||
94 | STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback, | ||||||
95 | "Number of OpenMP target region entry points (=kernels) executed in " | ||||||
96 | "generic-mode with customized state machines without fallback"); | ||||||
97 | STATISTIC( | ||||||
98 | NumOpenMPParallelRegionsReplacedInGPUStateMachine, | ||||||
99 | "Number of OpenMP parallel regions replaced with ID in GPU state machines"); | ||||||
100 | STATISTIC(NumOpenMPParallelRegionsMerged, | ||||||
101 | "Number of OpenMP parallel regions merged"); | ||||||
102 | STATISTIC(NumBytesMovedToSharedMemory, | ||||||
103 | "Amount of memory pushed to shared memory"); | ||||||
104 | |||||||
105 | #if !defined(NDEBUG) | ||||||
106 | static constexpr auto TAG = "[" DEBUG_TYPE "]"; | ||||||
107 | #endif | ||||||
108 | |||||||
109 | namespace { | ||||||
110 | |||||||
111 | enum class AddressSpace : unsigned { | ||||||
112 | Generic = 0, | ||||||
113 | Global = 1, | ||||||
114 | Shared = 3, | ||||||
115 | Constant = 4, | ||||||
116 | Local = 5, | ||||||
117 | }; | ||||||
118 | |||||||
119 | struct AAHeapToShared; | ||||||
120 | |||||||
121 | struct AAICVTracker; | ||||||
122 | |||||||
123 | /// OpenMP specific information. For now, stores RFIs and ICVs also needed for | ||||||
124 | /// Attributor runs. | ||||||
125 | struct OMPInformationCache : public InformationCache { | ||||||
126 | OMPInformationCache(Module &M, AnalysisGetter &AG, | ||||||
127 | BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC, | ||||||
128 | SmallPtrSetImpl<Kernel> &Kernels) | ||||||
129 | : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M), | ||||||
130 | Kernels(Kernels) { | ||||||
131 | |||||||
132 | OMPBuilder.initialize(); | ||||||
133 | initializeRuntimeFunctions(); | ||||||
134 | initializeInternalControlVars(); | ||||||
135 | } | ||||||
136 | |||||||
137 | /// Generic information that describes an internal control variable. | ||||||
138 | struct InternalControlVarInfo { | ||||||
139 | /// The kind, as described by InternalControlVar enum. | ||||||
140 | InternalControlVar Kind; | ||||||
141 | |||||||
142 | /// The name of the ICV. | ||||||
143 | StringRef Name; | ||||||
144 | |||||||
145 | /// Environment variable associated with this ICV. | ||||||
146 | StringRef EnvVarName; | ||||||
147 | |||||||
148 | /// Initial value kind. | ||||||
149 | ICVInitValue InitKind; | ||||||
150 | |||||||
151 | /// Initial value. | ||||||
152 | ConstantInt *InitValue; | ||||||
153 | |||||||
154 | /// Setter RTL function associated with this ICV. | ||||||
155 | RuntimeFunction Setter; | ||||||
156 | |||||||
157 | /// Getter RTL function associated with this ICV. | ||||||
158 | RuntimeFunction Getter; | ||||||
159 | |||||||
160 | /// RTL Function corresponding to the override clause of this ICV | ||||||
161 | RuntimeFunction Clause; | ||||||
162 | }; | ||||||
163 | |||||||
164 | /// Generic information that describes a runtime function | ||||||
165 | struct RuntimeFunctionInfo { | ||||||
166 | |||||||
167 | /// The kind, as described by the RuntimeFunction enum. | ||||||
168 | RuntimeFunction Kind; | ||||||
169 | |||||||
170 | /// The name of the function. | ||||||
171 | StringRef Name; | ||||||
172 | |||||||
173 | /// Flag to indicate a variadic function. | ||||||
174 | bool IsVarArg; | ||||||
175 | |||||||
176 | /// The return type of the function. | ||||||
177 | Type *ReturnType; | ||||||
178 | |||||||
179 | /// The argument types of the function. | ||||||
180 | SmallVector<Type *, 8> ArgumentTypes; | ||||||
181 | |||||||
182 | /// The declaration if available. | ||||||
183 | Function *Declaration = nullptr; | ||||||
184 | |||||||
185 | /// Uses of this runtime function per function containing the use. | ||||||
186 | using UseVector = SmallVector<Use *, 16>; | ||||||
187 | |||||||
188 | /// Clear UsesMap for runtime function. | ||||||
189 | void clearUsesMap() { UsesMap.clear(); } | ||||||
190 | |||||||
191 | /// Boolean conversion that is true if the runtime function was found. | ||||||
192 | operator bool() const { return Declaration; } | ||||||
193 | |||||||
194 | /// Return the vector of uses in function \p F. | ||||||
195 | UseVector &getOrCreateUseVector(Function *F) { | ||||||
196 | std::shared_ptr<UseVector> &UV = UsesMap[F]; | ||||||
197 | if (!UV) | ||||||
198 | UV = std::make_shared<UseVector>(); | ||||||
199 | return *UV; | ||||||
200 | } | ||||||
201 | |||||||
202 | /// Return the vector of uses in function \p F or `nullptr` if there are | ||||||
203 | /// none. | ||||||
204 | const UseVector *getUseVector(Function &F) const { | ||||||
205 | auto I = UsesMap.find(&F); | ||||||
206 | if (I != UsesMap.end()) | ||||||
207 | return I->second.get(); | ||||||
208 | return nullptr; | ||||||
209 | } | ||||||
210 | |||||||
211 | /// Return how many functions contain uses of this runtime function. | ||||||
212 | size_t getNumFunctionsWithUses() const { return UsesMap.size(); } | ||||||
213 | |||||||
214 | /// Return the number of arguments (or the minimal number for variadic | ||||||
215 | /// functions). | ||||||
216 | size_t getNumArgs() const { return ArgumentTypes.size(); } | ||||||
217 | |||||||
218 | /// Run the callback \p CB on each use and forget the use if the result is | ||||||
219 | /// true. The callback will be fed the function in which the use was | ||||||
220 | /// encountered as second argument. | ||||||
221 | void foreachUse(SmallVectorImpl<Function *> &SCC, | ||||||
222 | function_ref<bool(Use &, Function &)> CB) { | ||||||
223 | for (Function *F : SCC) | ||||||
224 | foreachUse(CB, F); | ||||||
225 | } | ||||||
226 | |||||||
227 | /// Run the callback \p CB on each use within the function \p F and forget | ||||||
228 | /// the use if the result is true. | ||||||
229 | void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) { | ||||||
230 | SmallVector<unsigned, 8> ToBeDeleted; | ||||||
231 | ToBeDeleted.clear(); | ||||||
232 | |||||||
233 | unsigned Idx = 0; | ||||||
234 | UseVector &UV = getOrCreateUseVector(F); | ||||||
235 | |||||||
236 | for (Use *U : UV) { | ||||||
237 | if (CB(*U, *F)) | ||||||
238 | ToBeDeleted.push_back(Idx); | ||||||
239 | ++Idx; | ||||||
240 | } | ||||||
241 | |||||||
242 | // Remove the to-be-deleted indices in reverse order as prior | ||||||
243 | // modifications will not modify the smaller indices. | ||||||
244 | while (!ToBeDeleted.empty()) { | ||||||
245 | unsigned Idx = ToBeDeleted.pop_back_val(); | ||||||
246 | UV[Idx] = UV.back(); | ||||||
247 | UV.pop_back(); | ||||||
248 | } | ||||||
249 | } | ||||||
250 | |||||||
251 | private: | ||||||
252 | /// Map from functions to all uses of this runtime function contained in | ||||||
253 | /// them. | ||||||
254 | DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap; | ||||||
255 | |||||||
256 | public: | ||||||
257 | /// Iterators for the uses of this runtime function. | ||||||
258 | decltype(UsesMap)::iterator begin() { return UsesMap.begin(); } | ||||||
259 | decltype(UsesMap)::iterator end() { return UsesMap.end(); } | ||||||
260 | }; | ||||||
261 | |||||||
262 | /// An OpenMP-IR-Builder instance | ||||||
263 | OpenMPIRBuilder OMPBuilder; | ||||||
264 | |||||||
265 | /// Map from runtime function kind to the runtime function description. | ||||||
266 | EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction, | ||||||
267 | RuntimeFunction::OMPRTL___last> | ||||||
268 | RFIs; | ||||||
269 | |||||||
270 | /// Map from function declarations/definitions to their runtime enum type. | ||||||
271 | DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap; | ||||||
272 | |||||||
273 | /// Map from ICV kind to the ICV description. | ||||||
274 | EnumeratedArray<InternalControlVarInfo, InternalControlVar, | ||||||
275 | InternalControlVar::ICV___last> | ||||||
276 | ICVs; | ||||||
277 | |||||||
278 | /// Helper to initialize all internal control variable information for those | ||||||
279 | /// defined in OMPKinds.def. | ||||||
280 | void initializeInternalControlVars() { | ||||||
281 | #define ICV_RT_SET(_Name, RTL) \ | ||||||
282 | { \ | ||||||
283 | auto &ICV = ICVs[_Name]; \ | ||||||
284 | ICV.Setter = RTL; \ | ||||||
285 | } | ||||||
286 | #define ICV_RT_GET(Name, RTL) \ | ||||||
287 | { \ | ||||||
288 | auto &ICV = ICVs[Name]; \ | ||||||
289 | ICV.Getter = RTL; \ | ||||||
290 | } | ||||||
291 | #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \ | ||||||
292 | { \ | ||||||
293 | auto &ICV = ICVs[Enum]; \ | ||||||
294 | ICV.Name = _Name; \ | ||||||
295 | ICV.Kind = Enum; \ | ||||||
296 | ICV.InitKind = Init; \ | ||||||
297 | ICV.EnvVarName = _EnvVarName; \ | ||||||
298 | switch (ICV.InitKind) { \ | ||||||
299 | case ICV_IMPLEMENTATION_DEFINED: \ | ||||||
300 | ICV.InitValue = nullptr; \ | ||||||
301 | break; \ | ||||||
302 | case ICV_ZERO: \ | ||||||
303 | ICV.InitValue = ConstantInt::get( \ | ||||||
304 | Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \ | ||||||
305 | break; \ | ||||||
306 | case ICV_FALSE: \ | ||||||
307 | ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \ | ||||||
308 | break; \ | ||||||
309 | case ICV_LAST: \ | ||||||
310 | break; \ | ||||||
311 | } \ | ||||||
312 | } | ||||||
313 | #include "llvm/Frontend/OpenMP/OMPKinds.def" | ||||||
314 | } | ||||||
315 | |||||||
316 | /// Returns true if the function declaration \p F matches the runtime | ||||||
317 | /// function types, that is, return type \p RTFRetType, and argument types | ||||||
318 | /// \p RTFArgTypes. | ||||||
319 | static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, | ||||||
320 | SmallVector<Type *, 8> &RTFArgTypes) { | ||||||
321 | // TODO: We should output information to the user (under debug output | ||||||
322 | // and via remarks). | ||||||
323 | |||||||
324 | if (!F) | ||||||
325 | return false; | ||||||
326 | if (F->getReturnType() != RTFRetType) | ||||||
327 | return false; | ||||||
328 | if (F->arg_size() != RTFArgTypes.size()) | ||||||
329 | return false; | ||||||
330 | |||||||
331 | auto RTFTyIt = RTFArgTypes.begin(); | ||||||
332 | for (Argument &Arg : F->args()) { | ||||||
333 | if (Arg.getType() != *RTFTyIt) | ||||||
334 | return false; | ||||||
335 | |||||||
336 | ++RTFTyIt; | ||||||
337 | } | ||||||
338 | |||||||
339 | return true; | ||||||
340 | } | ||||||
341 | |||||||
342 | // Helper to collect all uses of the declaration in the UsesMap. | ||||||
343 | unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) { | ||||||
344 | unsigned NumUses = 0; | ||||||
345 | if (!RFI.Declaration) | ||||||
346 | return NumUses; | ||||||
347 | OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); | ||||||
348 | |||||||
349 | if (CollectStats) { | ||||||
350 | NumOpenMPRuntimeFunctionsIdentified += 1; | ||||||
351 | NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); | ||||||
352 | } | ||||||
353 | |||||||
354 | // TODO: We directly convert uses into proper calls and unknown uses. | ||||||
355 | for (Use &U : RFI.Declaration->uses()) { | ||||||
356 | if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) { | ||||||
357 | if (ModuleSlice.count(UserI->getFunction())) { | ||||||
358 | RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); | ||||||
359 | ++NumUses; | ||||||
360 | } | ||||||
361 | } else { | ||||||
362 | RFI.getOrCreateUseVector(nullptr).push_back(&U); | ||||||
363 | ++NumUses; | ||||||
364 | } | ||||||
365 | } | ||||||
366 | return NumUses; | ||||||
367 | } | ||||||
368 | |||||||
369 | // Helper function to recollect uses of a runtime function. | ||||||
370 | void recollectUsesForFunction(RuntimeFunction RTF) { | ||||||
371 | auto &RFI = RFIs[RTF]; | ||||||
372 | RFI.clearUsesMap(); | ||||||
373 | collectUses(RFI, /*CollectStats*/ false); | ||||||
374 | } | ||||||
375 | |||||||
376 | // Helper function to recollect uses of all runtime functions. | ||||||
377 | void recollectUses() { | ||||||
378 | for (int Idx = 0; Idx < RFIs.size(); ++Idx) | ||||||
379 | recollectUsesForFunction(static_cast<RuntimeFunction>(Idx)); | ||||||
380 | } | ||||||
381 | |||||||
382 | /// Helper to initialize all runtime function information for those defined | ||||||
383 | /// in OpenMPKinds.def. | ||||||
384 | void initializeRuntimeFunctions() { | ||||||
385 | Module &M = *((*ModuleSlice.begin())->getParent()); | ||||||
386 | |||||||
387 | // Helper macros for handling __VA_ARGS__ in OMP_RTL | ||||||
388 | #define OMP_TYPE(VarName, ...) \ | ||||||
389 | Type *VarName = OMPBuilder.VarName; \ | ||||||
390 | (void)VarName; | ||||||
391 | |||||||
392 | #define OMP_ARRAY_TYPE(VarName, ...) \ | ||||||
393 | ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \ | ||||||
394 | (void)VarName##Ty; \ | ||||||
395 | PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \ | ||||||
396 | (void)VarName##PtrTy; | ||||||
397 | |||||||
398 | #define OMP_FUNCTION_TYPE(VarName, ...) \ | ||||||
399 | FunctionType *VarName = OMPBuilder.VarName; \ | ||||||
400 | (void)VarName; \ | ||||||
401 | PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ | ||||||
402 | (void)VarName##Ptr; | ||||||
403 | |||||||
404 | #define OMP_STRUCT_TYPE(VarName, ...) \ | ||||||
405 | StructType *VarName = OMPBuilder.VarName; \ | ||||||
406 | (void)VarName; \ | ||||||
407 | PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \ | ||||||
408 | (void)VarName##Ptr; | ||||||
409 | |||||||
410 | #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ | ||||||
411 | { \ | ||||||
412 | SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \ | ||||||
413 | Function *F = M.getFunction(_Name); \ | ||||||
414 | RTLFunctions.insert(F); \ | ||||||
415 | if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \ | ||||||
416 | RuntimeFunctionIDMap[F] = _Enum; \ | ||||||
417 | F->removeFnAttr(Attribute::NoInline); \ | ||||||
418 | auto &RFI = RFIs[_Enum]; \ | ||||||
419 | RFI.Kind = _Enum; \ | ||||||
420 | RFI.Name = _Name; \ | ||||||
421 | RFI.IsVarArg = _IsVarArg; \ | ||||||
422 | RFI.ReturnType = OMPBuilder._ReturnType; \ | ||||||
423 | RFI.ArgumentTypes = std::move(ArgsTypes); \ | ||||||
424 | RFI.Declaration = F; \ | ||||||
425 | unsigned NumUses = collectUses(RFI); \ | ||||||
426 | (void)NumUses; \ | ||||||
427 | LLVM_DEBUG({ \ | ||||||
428 | dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ | ||||||
429 | << " found\n"; \ | ||||||
430 | if (RFI.Declaration) \ | ||||||
431 | dbgs() << TAG << "-> got " << NumUses << " uses in " \ | ||||||
432 | << RFI.getNumFunctionsWithUses() \ | ||||||
433 | << " different functions.\n"; \ | ||||||
434 | }); \ | ||||||
435 | } \ | ||||||
436 | } | ||||||
437 | #include "llvm/Frontend/OpenMP/OMPKinds.def" | ||||||
438 | |||||||
439 | // TODO: We should attach the attributes defined in OMPKinds.def. | ||||||
440 | } | ||||||
441 | |||||||
442 | /// Collection of known kernels (\see Kernel) in the module. | ||||||
443 | SmallPtrSetImpl<Kernel> &Kernels; | ||||||
444 | |||||||
445 | /// Collection of known OpenMP runtime functions. | ||||||
446 | DenseSet<const Function *> RTLFunctions; | ||||||
447 | }; | ||||||
448 | |||||||
449 | template <typename Ty, bool InsertInvalidates = true> | ||||||
450 | struct BooleanStateWithSetVector : public BooleanState { | ||||||
451 | bool contains(const Ty &Elem) const { return Set.contains(Elem); } | ||||||
452 | bool insert(const Ty &Elem) { | ||||||
453 | if (InsertInvalidates) | ||||||
454 | BooleanState::indicatePessimisticFixpoint(); | ||||||
455 | return Set.insert(Elem); | ||||||
456 | } | ||||||
457 | |||||||
458 | const Ty &operator[](int Idx) const { return Set[Idx]; } | ||||||
459 | bool operator==(const BooleanStateWithSetVector &RHS) const { | ||||||
460 | return BooleanState::operator==(RHS) && Set == RHS.Set; | ||||||
461 | } | ||||||
462 | bool operator!=(const BooleanStateWithSetVector &RHS) const { | ||||||
463 | return !(*this == RHS); | ||||||
464 | } | ||||||
465 | |||||||
466 | bool empty() const { return Set.empty(); } | ||||||
467 | size_t size() const { return Set.size(); } | ||||||
468 | |||||||
469 | /// "Clamp" this state with \p RHS. | ||||||
470 | BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) { | ||||||
471 | BooleanState::operator^=(RHS); | ||||||
472 | Set.insert(RHS.Set.begin(), RHS.Set.end()); | ||||||
473 | return *this; | ||||||
474 | } | ||||||
475 | |||||||
476 | private: | ||||||
477 | /// A set to keep track of elements. | ||||||
478 | SetVector<Ty> Set; | ||||||
479 | |||||||
480 | public: | ||||||
481 | typename decltype(Set)::iterator begin() { return Set.begin(); } | ||||||
482 | typename decltype(Set)::iterator end() { return Set.end(); } | ||||||
483 | typename decltype(Set)::const_iterator begin() const { return Set.begin(); } | ||||||
484 | typename decltype(Set)::const_iterator end() const { return Set.end(); } | ||||||
485 | }; | ||||||
486 | |||||||
487 | template <typename Ty, bool InsertInvalidates = true> | ||||||
488 | using BooleanStateWithPtrSetVector = | ||||||
489 | BooleanStateWithSetVector<Ty *, InsertInvalidates>; | ||||||
490 | |||||||
491 | struct KernelInfoState : AbstractState { | ||||||
492 | /// Flag to track if we reached a fixpoint. | ||||||
493 | bool IsAtFixpoint = false; | ||||||
494 | |||||||
495 | /// The parallel regions (identified by the outlined parallel functions) that | ||||||
496 | /// can be reached from the associated function. | ||||||
497 | BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false> | ||||||
498 | ReachedKnownParallelRegions; | ||||||
499 | |||||||
500 | /// State to track what parallel region we might reach. | ||||||
501 | BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions; | ||||||
502 | |||||||
503 | /// State to track if we are in SPMD-mode, assumed or known, and why we decided | ||||||
504 | /// we cannot be. If it is assumed, then RequiresFullRuntime should also be | ||||||
505 | /// false. | ||||||
506 | BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker; | ||||||
507 | |||||||
508 | /// The __kmpc_target_init call in this kernel, if any. If we find more than | ||||||
509 | /// one we abort as the kernel is malformed. | ||||||
510 | CallBase *KernelInitCB = nullptr; | ||||||
511 | |||||||
512 | /// The __kmpc_target_deinit call in this kernel, if any. If we find more than | ||||||
513 | /// one we abort as the kernel is malformed. | ||||||
514 | CallBase *KernelDeinitCB = nullptr; | ||||||
515 | |||||||
516 | /// Flag to indicate if the associated function is a kernel entry. | ||||||
517 | bool IsKernelEntry = false; | ||||||
518 | |||||||
519 | /// State to track what kernel entries can reach the associated function. | ||||||
520 | BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries; | ||||||
521 | |||||||
522 | /// State to indicate if we can track parallel level of the associated | ||||||
523 | /// function. We will give up tracking if we encounter unknown caller or the | ||||||
524 | /// caller is __kmpc_parallel_51. | ||||||
525 | BooleanStateWithSetVector<uint8_t> ParallelLevels; | ||||||
526 | |||||||
527 | /// Abstract State interface | ||||||
528 | ///{ | ||||||
529 | |||||||
530 | KernelInfoState() {} | ||||||
531 | KernelInfoState(bool BestState) { | ||||||
532 | if (!BestState) | ||||||
533 | indicatePessimisticFixpoint(); | ||||||
534 | } | ||||||
535 | |||||||
536 | /// See AbstractState::isValidState(...) | ||||||
537 | bool isValidState() const override { return true; } | ||||||
538 | |||||||
539 | /// See AbstractState::isAtFixpoint(...) | ||||||
540 | bool isAtFixpoint() const override { return IsAtFixpoint; } | ||||||
541 | |||||||
542 | /// See AbstractState::indicatePessimisticFixpoint(...) | ||||||
543 | ChangeStatus indicatePessimisticFixpoint() override { | ||||||
544 | IsAtFixpoint = true; | ||||||
545 | SPMDCompatibilityTracker.indicatePessimisticFixpoint(); | ||||||
546 | ReachedUnknownParallelRegions.indicatePessimisticFixpoint(); | ||||||
547 | return ChangeStatus::CHANGED; | ||||||
548 | } | ||||||
549 | |||||||
550 | /// See AbstractState::indicateOptimisticFixpoint(...) | ||||||
551 | ChangeStatus indicateOptimisticFixpoint() override { | ||||||
552 | IsAtFixpoint = true; | ||||||
553 | return ChangeStatus::UNCHANGED; | ||||||
554 | } | ||||||
555 | |||||||
556 | /// Return the assumed state | ||||||
557 | KernelInfoState &getAssumed() { return *this; } | ||||||
558 | const KernelInfoState &getAssumed() const { return *this; } | ||||||
559 | |||||||
560 | bool operator==(const KernelInfoState &RHS) const { | ||||||
561 | if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker) | ||||||
562 | return false; | ||||||
563 | if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions) | ||||||
564 | return false; | ||||||
565 | if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions) | ||||||
566 | return false; | ||||||
567 | if (ReachingKernelEntries != RHS.ReachingKernelEntries) | ||||||
568 | return false; | ||||||
569 | return true; | ||||||
570 | } | ||||||
571 | |||||||
572 | /// Return empty set as the best state of potential values. | ||||||
573 | static KernelInfoState getBestState() { return KernelInfoState(true); } | ||||||
574 | |||||||
575 | static KernelInfoState getBestState(KernelInfoState &KIS) { | ||||||
576 | return getBestState(); | ||||||
577 | } | ||||||
578 | |||||||
579 | /// Return full set as the worst state of potential values. | ||||||
580 | static KernelInfoState getWorstState() { return KernelInfoState(false); } | ||||||
581 | |||||||
582 | /// "Clamp" this state with \p KIS. | ||||||
583 | KernelInfoState operator^=(const KernelInfoState &KIS) { | ||||||
584 | // Do not merge two different _init and _deinit call sites. | ||||||
585 | if (KIS.KernelInitCB) { | ||||||
586 | if (KernelInitCB && KernelInitCB != KIS.KernelInitCB) | ||||||
587 | indicatePessimisticFixpoint(); | ||||||
588 | KernelInitCB = KIS.KernelInitCB; | ||||||
589 | } | ||||||
590 | if (KIS.KernelDeinitCB) { | ||||||
591 | if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB) | ||||||
592 | indicatePessimisticFixpoint(); | ||||||
593 | KernelDeinitCB = KIS.KernelDeinitCB; | ||||||
594 | } | ||||||
595 | SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker; | ||||||
596 | ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions; | ||||||
597 | ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions; | ||||||
598 | return *this; | ||||||
599 | } | ||||||
600 | |||||||
601 | KernelInfoState operator&=(const KernelInfoState &KIS) { | ||||||
602 | return (*this ^= KIS); | ||||||
603 | } | ||||||
604 | |||||||
605 | ///} | ||||||
606 | }; | ||||||
607 | |||||||
608 | /// Used to map the values physically (in the IR) stored in an offload | ||||||
609 | /// array, to a vector in memory. | ||||||
610 | struct OffloadArray { | ||||||
611 | /// Physical array (in the IR). | ||||||
612 | AllocaInst *Array = nullptr; | ||||||
613 | /// Mapped values. | ||||||
614 | SmallVector<Value *, 8> StoredValues; | ||||||
615 | /// Last stores made in the offload array. | ||||||
616 | SmallVector<StoreInst *, 8> LastAccesses; | ||||||
617 | |||||||
618 | OffloadArray() = default; | ||||||
619 | |||||||
620 | /// Initializes the OffloadArray with the values stored in \p Array before | ||||||
621 | /// instruction \p Before is reached. Returns false if the initialization | ||||||
622 | /// fails. | ||||||
623 | /// This MUST be used immediately after the construction of the object. | ||||||
624 | bool initialize(AllocaInst &Array, Instruction &Before) { | ||||||
625 | if (!Array.getAllocatedType()->isArrayTy()) | ||||||
626 | return false; | ||||||
627 | |||||||
628 | if (!getValues(Array, Before)) | ||||||
629 | return false; | ||||||
630 | |||||||
631 | this->Array = &Array; | ||||||
632 | return true; | ||||||
633 | } | ||||||
634 | |||||||
635 | static const unsigned DeviceIDArgNum = 1; | ||||||
636 | static const unsigned BasePtrsArgNum = 3; | ||||||
637 | static const unsigned PtrsArgNum = 4; | ||||||
638 | static const unsigned SizesArgNum = 5; | ||||||
639 | |||||||
640 | private: | ||||||
641 | /// Traverses the BasicBlock where \p Array is, collecting the stores made to | ||||||
642 | /// \p Array, leaving StoredValues with the values stored before the | ||||||
643 | /// instruction \p Before is reached. | ||||||
644 | bool getValues(AllocaInst &Array, Instruction &Before) { | ||||||
645 | // Initialize container. | ||||||
646 | const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements(); | ||||||
647 | StoredValues.assign(NumValues, nullptr); | ||||||
648 | LastAccesses.assign(NumValues, nullptr); | ||||||
649 | |||||||
650 | // TODO: This assumes the instruction \p Before is in the same | ||||||
651 | // BasicBlock as Array. Make it general, for any control flow graph. | ||||||
652 | BasicBlock *BB = Array.getParent(); | ||||||
653 | if (BB != Before.getParent()) | ||||||
654 | return false; | ||||||
655 | |||||||
656 | const DataLayout &DL = Array.getModule()->getDataLayout(); | ||||||
657 | const unsigned int PointerSize = DL.getPointerSize(); | ||||||
658 | |||||||
659 | for (Instruction &I : *BB) { | ||||||
660 | if (&I == &Before) | ||||||
661 | break; | ||||||
662 | |||||||
663 | if (!isa<StoreInst>(&I)) | ||||||
664 | continue; | ||||||
665 | |||||||
666 | auto *S = cast<StoreInst>(&I); | ||||||
667 | int64_t Offset = -1; | ||||||
668 | auto *Dst = | ||||||
669 | GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL); | ||||||
670 | if (Dst == &Array) { | ||||||
671 | int64_t Idx = Offset / PointerSize; | ||||||
672 | StoredValues[Idx] = getUnderlyingObject(S->getValueOperand()); | ||||||
673 | LastAccesses[Idx] = S; | ||||||
674 | } | ||||||
675 | } | ||||||
676 | |||||||
677 | return isFilled(); | ||||||
678 | } | ||||||
679 | |||||||
680 | /// Returns true if all values in StoredValues and | ||||||
681 | /// LastAccesses are not nullptrs. | ||||||
682 | bool isFilled() { | ||||||
683 | const unsigned NumValues = StoredValues.size(); | ||||||
684 | for (unsigned I = 0; I < NumValues; ++I) { | ||||||
685 | if (!StoredValues[I] || !LastAccesses[I]) | ||||||
686 | return false; | ||||||
687 | } | ||||||
688 | |||||||
689 | return true; | ||||||
690 | } | ||||||
691 | }; | ||||||
692 | |||||||
693 | struct OpenMPOpt { | ||||||
694 | |||||||
695 | using OptimizationRemarkGetter = | ||||||
696 | function_ref<OptimizationRemarkEmitter &(Function *)>; | ||||||
697 | |||||||
698 | OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater, | ||||||
699 | OptimizationRemarkGetter OREGetter, | ||||||
700 | OMPInformationCache &OMPInfoCache, Attributor &A) | ||||||
701 | : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), | ||||||
702 | OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {} | ||||||
703 | |||||||
704 | /// Check if any remarks are enabled for openmp-opt | ||||||
705 | bool remarksEnabled() { | ||||||
706 | auto &Ctx = M.getContext(); | ||||||
707 | return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE); | ||||||
708 | } | ||||||
709 | |||||||
710 | /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. | ||||||
711 | bool run(bool IsModulePass) { | ||||||
712 | if (SCC.empty()) | ||||||
713 | return false; | ||||||
714 | |||||||
715 | bool Changed = false; | ||||||
716 | |||||||
717 | LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()do { } while (false) | ||||||
718 | << " functions in a slice with "do { } while (false) | ||||||
719 | << OMPInfoCache.ModuleSlice.size() << " functions\n")do { } while (false); | ||||||
720 | |||||||
721 | if (IsModulePass) { | ||||||
722 | Changed |= runAttributor(IsModulePass); | ||||||
723 | |||||||
724 | // Recollect uses, in case Attributor deleted any. | ||||||
725 | OMPInfoCache.recollectUses(); | ||||||
726 | |||||||
727 | // TODO: This should be folded into buildCustomStateMachine. | ||||||
728 | Changed |= rewriteDeviceCodeStateMachine(); | ||||||
729 | |||||||
730 | if (remarksEnabled()) | ||||||
731 | analysisGlobalization(); | ||||||
732 | } else { | ||||||
733 | if (PrintICVValues) | ||||||
734 | printICVs(); | ||||||
735 | if (PrintOpenMPKernels) | ||||||
736 | printKernels(); | ||||||
737 | |||||||
738 | Changed |= runAttributor(IsModulePass); | ||||||
739 | |||||||
740 | // Recollect uses, in case Attributor deleted any. | ||||||
741 | OMPInfoCache.recollectUses(); | ||||||
742 | |||||||
743 | Changed |= deleteParallelRegions(); | ||||||
744 | |||||||
745 | if (HideMemoryTransferLatency) | ||||||
746 | Changed |= hideMemTransfersLatency(); | ||||||
747 | Changed |= deduplicateRuntimeCalls(); | ||||||
748 | if (EnableParallelRegionMerging) { | ||||||
749 | if (mergeParallelRegions()) { | ||||||
750 | deduplicateRuntimeCalls(); | ||||||
751 | Changed = true; | ||||||
752 | } | ||||||
753 | } | ||||||
754 | } | ||||||
755 | |||||||
756 | return Changed; | ||||||
757 | } | ||||||
758 | |||||||
759 | /// Print initial ICV values for testing. | ||||||
760 | /// FIXME: This should be done from the Attributor once it is added. | ||||||
761 | void printICVs() const { | ||||||
762 | InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel, | ||||||
763 | ICV_proc_bind}; | ||||||
764 | |||||||
765 | for (Function *F : OMPInfoCache.ModuleSlice) { | ||||||
766 | for (auto ICV : ICVs) { | ||||||
767 | auto ICVInfo = OMPInfoCache.ICVs[ICV]; | ||||||
768 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
769 | return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) | ||||||
770 | << " Value: " | ||||||
771 | << (ICVInfo.InitValue | ||||||
772 | ? toString(ICVInfo.InitValue->getValue(), 10, true) | ||||||
773 | : "IMPLEMENTATION_DEFINED"); | ||||||
774 | }; | ||||||
775 | |||||||
776 | emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark); | ||||||
777 | } | ||||||
778 | } | ||||||
779 | } | ||||||
780 | |||||||
781 | /// Print OpenMP GPU kernels for testing. | ||||||
782 | void printKernels() const { | ||||||
783 | for (Function *F : SCC) { | ||||||
784 | if (!OMPInfoCache.Kernels.count(F)) | ||||||
785 | continue; | ||||||
786 | |||||||
787 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
788 | return ORA << "OpenMP GPU kernel " | ||||||
789 | << ore::NV("OpenMPGPUKernel", F->getName()) << "\n"; | ||||||
790 | }; | ||||||
791 | |||||||
792 | emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark); | ||||||
793 | } | ||||||
794 | } | ||||||
795 | |||||||
796 | /// Return the call if \p U is a callee use in a regular call. If \p RFI is | ||||||
797 | /// given it has to be the callee or a nullptr is returned. | ||||||
798 | static CallInst *getCallIfRegularCall( | ||||||
799 | Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { | ||||||
800 | CallInst *CI = dyn_cast<CallInst>(U.getUser()); | ||||||
801 | if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && | ||||||
802 | (!RFI || | ||||||
803 | (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration))) | ||||||
804 | return CI; | ||||||
805 | return nullptr; | ||||||
806 | } | ||||||
807 | |||||||
808 | /// Return the call if \p V is a regular call. If \p RFI is given it has to be | ||||||
809 | /// the callee or a nullptr is returned. | ||||||
810 | static CallInst *getCallIfRegularCall( | ||||||
811 | Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { | ||||||
812 | CallInst *CI = dyn_cast<CallInst>(&V); | ||||||
813 | if (CI && !CI->hasOperandBundles() && | ||||||
814 | (!RFI || | ||||||
815 | (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration))) | ||||||
816 | return CI; | ||||||
817 | return nullptr; | ||||||
818 | } | ||||||
819 | |||||||
820 | private: | ||||||
821 | /// Merge parallel regions when it is safe. | ||||||
822 | bool mergeParallelRegions() { | ||||||
823 | const unsigned CallbackCalleeOperand = 2; | ||||||
824 | const unsigned CallbackFirstArgOperand = 3; | ||||||
825 | using InsertPointTy = OpenMPIRBuilder::InsertPointTy; | ||||||
826 | |||||||
827 | // Check if there are any __kmpc_fork_call calls to merge. | ||||||
828 | OMPInformationCache::RuntimeFunctionInfo &RFI = | ||||||
829 | OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; | ||||||
830 | |||||||
831 | if (!RFI.Declaration) | ||||||
832 | return false; | ||||||
833 | |||||||
834 | // Unmergable calls that prevent merging a parallel region. | ||||||
835 | OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = { | ||||||
836 | OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind], | ||||||
837 | OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads], | ||||||
838 | }; | ||||||
839 | |||||||
840 | bool Changed = false; | ||||||
841 | LoopInfo *LI = nullptr; | ||||||
842 | DominatorTree *DT = nullptr; | ||||||
843 | |||||||
844 | SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap; | ||||||
845 | |||||||
846 | BasicBlock *StartBB = nullptr, *EndBB = nullptr; | ||||||
847 | auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, | ||||||
848 | BasicBlock &ContinuationIP) { | ||||||
849 | BasicBlock *CGStartBB = CodeGenIP.getBlock(); | ||||||
850 | BasicBlock *CGEndBB = | ||||||
851 | SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); | ||||||
852 | assert(StartBB != nullptr && "StartBB should not be null"); | ||||||
853 | CGStartBB->getTerminator()->setSuccessor(0, StartBB); | ||||||
854 | assert(EndBB != nullptr && "EndBB should not be null"); | ||||||
855 | EndBB->getTerminator()->setSuccessor(0, CGEndBB); | ||||||
856 | }; | ||||||
857 | |||||||
858 | auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &, | ||||||
859 | Value &Inner, Value *&ReplacementValue) -> InsertPointTy { | ||||||
860 | ReplacementValue = &Inner; | ||||||
861 | return CodeGenIP; | ||||||
862 | }; | ||||||
863 | |||||||
864 | auto FiniCB = [&](InsertPointTy CodeGenIP) {}; | ||||||
865 | |||||||
866 | /// Create a sequential execution region within a merged parallel region, | ||||||
867 | /// encapsulated in a master construct with a barrier for synchronization. | ||||||
868 | auto CreateSequentialRegion = [&](Function *OuterFn, | ||||||
869 | BasicBlock *OuterPredBB, | ||||||
870 | Instruction *SeqStartI, | ||||||
871 | Instruction *SeqEndI) { | ||||||
872 | // Isolate the instructions of the sequential region to a separate | ||||||
873 | // block. | ||||||
874 | BasicBlock *ParentBB = SeqStartI->getParent(); | ||||||
875 | BasicBlock *SeqEndBB = | ||||||
876 | SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI); | ||||||
877 | BasicBlock *SeqAfterBB = | ||||||
878 | SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI); | ||||||
879 | BasicBlock *SeqStartBB = | ||||||
880 | SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged"); | ||||||
881 | |||||||
882 | assert(ParentBB->getUniqueSuccessor() == SeqStartBB && | ||||||
883 | "Expected a different CFG"); | ||||||
884 | const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); | ||||||
885 | ParentBB->getTerminator()->eraseFromParent(); | ||||||
886 | |||||||
887 | auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, | ||||||
888 | BasicBlock &ContinuationIP) { | ||||||
889 | BasicBlock *CGStartBB = CodeGenIP.getBlock(); | ||||||
890 | BasicBlock *CGEndBB = | ||||||
891 | SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); | ||||||
892 | assert(SeqStartBB != nullptr && "SeqStartBB should not be null"); | ||||||
893 | CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB); | ||||||
894 | assert(SeqEndBB != nullptr && "SeqEndBB should not be null"); | ||||||
895 | SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB); | ||||||
896 | }; | ||||||
897 | auto FiniCB = [&](InsertPointTy CodeGenIP) {}; | ||||||
898 | |||||||
899 | // Find outputs from the sequential region to outside users and | ||||||
900 | // broadcast their values to them. | ||||||
901 | for (Instruction &I : *SeqStartBB) { | ||||||
902 | SmallPtrSet<Instruction *, 4> OutsideUsers; | ||||||
903 | for (User *Usr : I.users()) { | ||||||
904 | Instruction &UsrI = *cast<Instruction>(Usr); | ||||||
905 | // Ignore outputs to LT intrinsics, code extraction for the merged | ||||||
906 | // parallel region will fix them. | ||||||
907 | if (UsrI.isLifetimeStartOrEnd()) | ||||||
908 | continue; | ||||||
909 | |||||||
910 | if (UsrI.getParent() != SeqStartBB) | ||||||
911 | OutsideUsers.insert(&UsrI); | ||||||
912 | } | ||||||
913 | |||||||
914 | if (OutsideUsers.empty()) | ||||||
915 | continue; | ||||||
916 | |||||||
917 | // Emit an alloca in the outer region to store the broadcasted | ||||||
918 | // value. | ||||||
919 | const DataLayout &DL = M.getDataLayout(); | ||||||
920 | AllocaInst *AllocaI = new AllocaInst( | ||||||
921 | I.getType(), DL.getAllocaAddrSpace(), nullptr, | ||||||
922 | I.getName() + ".seq.output.alloc", &OuterFn->front().front()); | ||||||
923 | |||||||
924 | // Emit a store instruction in the sequential BB to update the | ||||||
925 | // value. | ||||||
926 | new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()); | ||||||
927 | |||||||
928 | // Emit a load instruction and replace the use of the output value | ||||||
929 | // with it. | ||||||
930 | for (Instruction *UsrI : OutsideUsers) { | ||||||
931 | LoadInst *LoadI = new LoadInst( | ||||||
932 | I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI); | ||||||
933 | UsrI->replaceUsesOfWith(&I, LoadI); | ||||||
934 | } | ||||||
935 | } | ||||||
936 | |||||||
937 | OpenMPIRBuilder::LocationDescription Loc( | ||||||
938 | InsertPointTy(ParentBB, ParentBB->end()), DL); | ||||||
939 | InsertPointTy SeqAfterIP = | ||||||
940 | OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB); | ||||||
941 | |||||||
942 | OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel); | ||||||
943 | |||||||
944 | BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock()); | ||||||
945 | |||||||
946 | LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFndo { } while (false) | ||||||
947 | << "\n")do { } while (false); | ||||||
948 | }; | ||||||
949 | |||||||
950 | // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all | ||||||
951 | // contained in BB and only separated by instructions that can be | ||||||
952 | // redundantly executed in parallel. The block BB is split before the first | ||||||
953 | // call (in MergableCIs) and after the last so the entire region we merge | ||||||
954 | // into a single parallel region is contained in a single basic block | ||||||
955 | // without any other instructions. We use the OpenMPIRBuilder to outline | ||||||
956 | // that block and call the resulting function via __kmpc_fork_call. | ||||||
957 | auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) { | ||||||
958 | // TODO: Change the interface to allow single CIs expanded, e.g., to | ||||||
959 | // include an outer loop. | ||||||
960 | assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs"); | ||||||
961 | |||||||
962 | auto Remark = [&](OptimizationRemark OR) { | ||||||
963 | OR << "Parallel region merged with parallel region" | ||||||
964 | << (MergableCIs.size() > 2 ? "s" : "") << " at "; | ||||||
965 | for (auto *CI : llvm::drop_begin(MergableCIs)) { | ||||||
966 | OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc()); | ||||||
967 | if (CI != MergableCIs.back()) | ||||||
968 | OR << ", "; | ||||||
969 | } | ||||||
970 | return OR << "."; | ||||||
971 | }; | ||||||
972 | |||||||
973 | emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark); | ||||||
974 | |||||||
975 | Function *OriginalFn = BB->getParent(); | ||||||
976 | LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()do { } while (false) | ||||||
977 | << " parallel regions in " << OriginalFn->getName()do { } while (false) | ||||||
978 | << "\n")do { } while (false); | ||||||
979 | |||||||
980 | // Isolate the calls to merge in a separate block. | ||||||
981 | EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI); | ||||||
982 | BasicBlock *AfterBB = | ||||||
983 | SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI); | ||||||
984 | StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr, | ||||||
985 | "omp.par.merged"); | ||||||
986 | |||||||
987 | assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG"); | ||||||
988 | const DebugLoc DL = BB->getTerminator()->getDebugLoc(); | ||||||
989 | BB->getTerminator()->eraseFromParent(); | ||||||
990 | |||||||
991 | // Create sequential regions for sequential instructions that are | ||||||
992 | // in-between mergable parallel regions. | ||||||
993 | for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1; | ||||||
994 | It != End; ++It) { | ||||||
995 | Instruction *ForkCI = *It; | ||||||
996 | Instruction *NextForkCI = *(It + 1); | ||||||
997 | |||||||
998 | // Continue if there are no in-between instructions. | ||||||
999 | if (ForkCI->getNextNode() == NextForkCI) | ||||||
1000 | continue; | ||||||
1001 | |||||||
1002 | CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(), | ||||||
1003 | NextForkCI->getPrevNode()); | ||||||
1004 | } | ||||||
1005 | |||||||
1006 | OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()), | ||||||
1007 | DL); | ||||||
1008 | IRBuilder<>::InsertPoint AllocaIP( | ||||||
1009 | &OriginalFn->getEntryBlock(), | ||||||
1010 | OriginalFn->getEntryBlock().getFirstInsertionPt()); | ||||||
1011 | // Create the merged parallel region with default proc binding, to | ||||||
1012 | // avoid overriding binding settings, and without explicit cancellation. | ||||||
1013 | InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel( | ||||||
1014 | Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, | ||||||
1015 | OMP_PROC_BIND_default, /* IsCancellable */ false); | ||||||
1016 | BranchInst::Create(AfterBB, AfterIP.getBlock()); | ||||||
1017 | |||||||
1018 | // Perform the actual outlining. | ||||||
1019 | OMPInfoCache.OMPBuilder.finalize(OriginalFn, | ||||||
1020 | /* AllowExtractorSinking */ true); | ||||||
1021 | |||||||
1022 | Function *OutlinedFn = MergableCIs.front()->getCaller(); | ||||||
1023 | |||||||
1024 | // Replace the __kmpc_fork_call calls with direct calls to the outlined | ||||||
1025 | // callbacks. | ||||||
1026 | SmallVector<Value *, 8> Args; | ||||||
1027 | for (auto *CI : MergableCIs) { | ||||||
1028 | Value *Callee = | ||||||
1029 | CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts(); | ||||||
1030 | FunctionType *FT = | ||||||
1031 | cast<FunctionType>(Callee->getType()->getPointerElementType()); | ||||||
1032 | Args.clear(); | ||||||
1033 | Args.push_back(OutlinedFn->getArg(0)); | ||||||
1034 | Args.push_back(OutlinedFn->getArg(1)); | ||||||
1035 | for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); | ||||||
1036 | U < E; ++U) | ||||||
1037 | Args.push_back(CI->getArgOperand(U)); | ||||||
1038 | |||||||
1039 | CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI); | ||||||
1040 | if (CI->getDebugLoc()) | ||||||
1041 | NewCI->setDebugLoc(CI->getDebugLoc()); | ||||||
1042 | |||||||
1043 | // Forward parameter attributes from the callback to the callee. | ||||||
1044 | for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); | ||||||
1045 | U < E; ++U) | ||||||
1046 | for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) | ||||||
1047 | NewCI->addParamAttr( | ||||||
1048 | U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); | ||||||
1049 | |||||||
1050 | // Emit an explicit barrier to replace the implicit fork-join barrier. | ||||||
1051 | if (CI != MergableCIs.back()) { | ||||||
1052 | // TODO: Remove barrier if the merged parallel region includes the | ||||||
1053 | // 'nowait' clause. | ||||||
1054 | OMPInfoCache.OMPBuilder.createBarrier( | ||||||
1055 | InsertPointTy(NewCI->getParent(), | ||||||
1056 | NewCI->getNextNode()->getIterator()), | ||||||
1057 | OMPD_parallel); | ||||||
1058 | } | ||||||
1059 | |||||||
1060 | CI->eraseFromParent(); | ||||||
1061 | } | ||||||
1062 | |||||||
1063 | assert(OutlinedFn != OriginalFn && "Outlining failed"); | ||||||
1064 | CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn); | ||||||
1065 | CGUpdater.reanalyzeFunction(*OriginalFn); | ||||||
1066 | |||||||
1067 | NumOpenMPParallelRegionsMerged += MergableCIs.size(); | ||||||
1068 | |||||||
1069 | return true; | ||||||
1070 | }; | ||||||
1071 | |||||||
1072 | // Helper function that identifies sequences of | ||||||
1073 | // __kmpc_fork_call uses in a basic block. | ||||||
1074 | auto DetectPRsCB = [&](Use &U, Function &F) { | ||||||
1075 | CallInst *CI = getCallIfRegularCall(U, &RFI); | ||||||
1076 | BB2PRMap[CI->getParent()].insert(CI); | ||||||
1077 | |||||||
1078 | return false; | ||||||
1079 | }; | ||||||
1080 | |||||||
1081 | BB2PRMap.clear(); | ||||||
1082 | RFI.foreachUse(SCC, DetectPRsCB); | ||||||
1083 | SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector; | ||||||
1084 | // Find mergable parallel regions within a basic block that are | ||||||
1085 | // safe to merge, that is any in-between instructions can safely | ||||||
1086 | // execute in parallel after merging. | ||||||
1087 | // TODO: support merging across basic-blocks. | ||||||
1088 | for (auto &It : BB2PRMap) { | ||||||
1089 | auto &CIs = It.getSecond(); | ||||||
1090 | if (CIs.size() < 2) | ||||||
1091 | continue; | ||||||
1092 | |||||||
1093 | BasicBlock *BB = It.getFirst(); | ||||||
1094 | SmallVector<CallInst *, 4> MergableCIs; | ||||||
1095 | |||||||
1096 | /// Returns true if the instruction is mergable, false otherwise. | ||||||
1097 | /// A terminator instruction is unmergable by definition since merging | ||||||
1098 | /// works within a BB. Instructions before the mergable region are | ||||||
1099 | /// mergable if they are not calls to OpenMP runtime functions that may | ||||||
1100 | /// set different execution parameters for subsequent parallel regions. | ||||||
1101 | /// Instructions in-between parallel regions are mergable if they are not | ||||||
1102 | /// calls to any non-intrinsic function since that may call a non-mergable | ||||||
1103 | /// OpenMP runtime function. | ||||||
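/// For example (illustrative): a preceding call to one of the proc_bind or
/// num_threads push functions listed in UnmergableCallsInfo keeps the next
/// region out of a merge, while any non-intrinsic call found between two
/// regions keeps those regions separate.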
1104 | auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) { | ||||||
1105 | // We do not merge across BBs, hence return false (unmergable) if the | ||||||
1106 | // instruction is a terminator. | ||||||
1107 | if (I.isTerminator()) | ||||||
1108 | return false; | ||||||
1109 | |||||||
1110 | if (!isa<CallInst>(&I)) | ||||||
1111 | return true; | ||||||
1112 | |||||||
1113 | CallInst *CI = cast<CallInst>(&I); | ||||||
1114 | if (IsBeforeMergableRegion) { | ||||||
1115 | Function *CalledFunction = CI->getCalledFunction(); | ||||||
1116 | if (!CalledFunction) | ||||||
1117 | return false; | ||||||
1118 | // Return false (unmergable) if the call before the parallel | ||||||
1119 | // region calls an explicit affinity (proc_bind) or number of | ||||||
1120 | // threads (num_threads) compiler-generated function. Those settings | ||||||
1121 | // may be incompatible with following parallel regions. | ||||||
1122 | // TODO: ICV tracking to detect compatibility. | ||||||
1123 | for (const auto &RFI : UnmergableCallsInfo) { | ||||||
1124 | if (CalledFunction == RFI.Declaration) | ||||||
1125 | return false; | ||||||
1126 | } | ||||||
1127 | } else { | ||||||
1128 | // Return false (unmergable) if there is a call instruction | ||||||
1129 | // in-between parallel regions when it is not an intrinsic. It | ||||||
1130 | // may call an unmergable OpenMP runtime function in its callpath. | ||||||
1131 | // TODO: Keep track of possible OpenMP calls in the callpath. | ||||||
1132 | if (!isa<IntrinsicInst>(CI)) | ||||||
1133 | return false; | ||||||
1134 | } | ||||||
1135 | |||||||
1136 | return true; | ||||||
1137 | }; | ||||||
1138 | // Find maximal number of parallel region CIs that are safe to merge. | ||||||
1139 | for (auto It = BB->begin(), End = BB->end(); It != End;) { | ||||||
1140 | Instruction &I = *It; | ||||||
1141 | ++It; | ||||||
1142 | |||||||
1143 | if (CIs.count(&I)) { | ||||||
1144 | MergableCIs.push_back(cast<CallInst>(&I)); | ||||||
1145 | continue; | ||||||
1146 | } | ||||||
1147 | |||||||
1148 | // Continue expanding if the instruction is mergable. | ||||||
1149 | if (IsMergable(I, MergableCIs.empty())) | ||||||
1150 | continue; | ||||||
1151 | |||||||
1152 | // Forward the instruction iterator to skip the next parallel region | ||||||
1153 | // since there is an unmergable instruction which can affect it. | ||||||
1154 | for (; It != End; ++It) { | ||||||
1155 | Instruction &SkipI = *It; | ||||||
1156 | if (CIs.count(&SkipI)) { | ||||||
1157 | LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI | ||||||
1158 | << " due to " << I << "\n"); | ||||||
1159 | ++It; | ||||||
1160 | break; | ||||||
1161 | } | ||||||
1162 | } | ||||||
1163 | |||||||
1164 | // Store mergable regions found. | ||||||
1165 | if (MergableCIs.size() > 1) { | ||||||
1166 | MergableCIsVector.push_back(MergableCIs); | ||||||
1167 | LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size() | ||||||
1168 | << " parallel regions in block " << BB->getName() | ||||||
1169 | << " of function " << BB->getParent()->getName() | ||||||
1170 | << "\n"); | ||||||
1171 | } | ||||||
1172 | |||||||
1173 | MergableCIs.clear(); | ||||||
1174 | } | ||||||
1175 | |||||||
1176 | if (!MergableCIsVector.empty()) { | ||||||
1177 | Changed = true; | ||||||
1178 | |||||||
1179 | for (auto &MergableCIs : MergableCIsVector) | ||||||
1180 | Merge(MergableCIs, BB); | ||||||
1181 | MergableCIsVector.clear(); | ||||||
1182 | } | ||||||
1183 | } | ||||||
1184 | |||||||
1185 | if (Changed) { | ||||||
1186 | /// Re-collect uses for fork calls, emitted barrier calls, and | ||||||
1187 | /// any emitted master/end_master calls. | ||||||
1188 | OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call); | ||||||
1189 | OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier); | ||||||
1190 | OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master); | ||||||
1191 | OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master); | ||||||
1192 | } | ||||||
1193 | |||||||
1194 | return Changed; | ||||||
1195 | } | ||||||
1196 | |||||||
1197 | /// Try to delete parallel regions if possible. | ||||||
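/// A minimal sketch of the pattern this looks for (illustrative IR):
///   call void (...) @__kmpc_fork_call(%struct.ident_t* @loc, i32 0,
///                                     void (i32*, i32*, ...)* @outlined)
/// where @outlined is 'readonly' and 'willreturn'; such a region has no
/// observable effect and the fork call can be erased.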
1198 | bool deleteParallelRegions() { | ||||||
1199 | const unsigned CallbackCalleeOperand = 2; | ||||||
1200 | |||||||
1201 | OMPInformationCache::RuntimeFunctionInfo &RFI = | ||||||
1202 | OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; | ||||||
1203 | |||||||
1204 | if (!RFI.Declaration) | ||||||
1205 | return false; | ||||||
1206 | |||||||
1207 | bool Changed = false; | ||||||
1208 | auto DeleteCallCB = [&](Use &U, Function &) { | ||||||
1209 | CallInst *CI = getCallIfRegularCall(U); | ||||||
1210 | if (!CI) | ||||||
1211 | return false; | ||||||
1212 | auto *Fn = dyn_cast<Function>( | ||||||
1213 | CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); | ||||||
1214 | if (!Fn) | ||||||
1215 | return false; | ||||||
1216 | if (!Fn->onlyReadsMemory()) | ||||||
1217 | return false; | ||||||
1218 | if (!Fn->hasFnAttribute(Attribute::WillReturn)) | ||||||
1219 | return false; | ||||||
1220 | |||||||
1221 | LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " | ||||||
1222 | << CI->getCaller()->getName() << "\n"); | ||||||
1223 | |||||||
1224 | auto Remark = [&](OptimizationRemark OR) { | ||||||
1225 | return OR << "Removing parallel region with no side-effects."; | ||||||
1226 | }; | ||||||
1227 | emitRemark<OptimizationRemark>(CI, "OMP160", Remark); | ||||||
1228 | |||||||
1229 | CGUpdater.removeCallSite(*CI); | ||||||
1230 | CI->eraseFromParent(); | ||||||
1231 | Changed = true; | ||||||
1232 | ++NumOpenMPParallelRegionsDeleted; | ||||||
1233 | return true; | ||||||
1234 | }; | ||||||
1235 | |||||||
1236 | RFI.foreachUse(SCC, DeleteCallCB); | ||||||
1237 | |||||||
1238 | return Changed; | ||||||
1239 | } | ||||||
1240 | |||||||
1241 | /// Try to eliminate runtime calls by reusing existing ones. | ||||||
1242 | bool deduplicateRuntimeCalls() { | ||||||
1243 | bool Changed = false; | ||||||
1244 | |||||||
1245 | RuntimeFunction DeduplicableRuntimeCallIDs[] = { | ||||||
1246 | OMPRTL_omp_get_num_threads, | ||||||
1247 | OMPRTL_omp_in_parallel, | ||||||
1248 | OMPRTL_omp_get_cancellation, | ||||||
1249 | OMPRTL_omp_get_thread_limit, | ||||||
1250 | OMPRTL_omp_get_supported_active_levels, | ||||||
1251 | OMPRTL_omp_get_level, | ||||||
1252 | OMPRTL_omp_get_ancestor_thread_num, | ||||||
1253 | OMPRTL_omp_get_team_size, | ||||||
1254 | OMPRTL_omp_get_active_level, | ||||||
1255 | OMPRTL_omp_in_final, | ||||||
1256 | OMPRTL_omp_get_proc_bind, | ||||||
1257 | OMPRTL_omp_get_num_places, | ||||||
1258 | OMPRTL_omp_get_num_procs, | ||||||
1259 | OMPRTL_omp_get_place_num, | ||||||
1260 | OMPRTL_omp_get_partition_num_places, | ||||||
1261 | OMPRTL_omp_get_partition_place_nums}; | ||||||
1262 | |||||||
1263 | // Global-tid is handled separately. | ||||||
1264 | SmallSetVector<Value *, 16> GTIdArgs; | ||||||
1265 | collectGlobalThreadIdArguments(GTIdArgs); | ||||||
1266 | LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() | ||||||
1267 | << " global thread ID arguments\n"); | ||||||
1268 | |||||||
1269 | for (Function *F : SCC) { | ||||||
1270 | for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) | ||||||
1271 | Changed |= deduplicateRuntimeCalls( | ||||||
1272 | *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); | ||||||
1273 | |||||||
1274 | // __kmpc_global_thread_num is special as we can replace it with an | ||||||
1275 | // argument in enough cases to make it worth trying. | ||||||
1276 | Value *GTIdArg = nullptr; | ||||||
1277 | for (Argument &Arg : F->args()) | ||||||
1278 | if (GTIdArgs.count(&Arg)) { | ||||||
1279 | GTIdArg = &Arg; | ||||||
1280 | break; | ||||||
1281 | } | ||||||
1282 | Changed |= deduplicateRuntimeCalls( | ||||||
1283 | *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); | ||||||
1284 | } | ||||||
1285 | |||||||
1286 | return Changed; | ||||||
1287 | } | ||||||
1288 | |||||||
1289 | /// Tries to hide the latency of runtime calls that involve host to | ||||||
1290 | /// device memory transfers by splitting them into their "issue" and "wait" | ||||||
1291 | /// versions. The "issue" is moved upwards as much as possible. The "wait" is | ||||||
1292 | /// moved downwards as much as possible. The "issue" issues the memory transfer | ||||||
1293 | /// asynchronously, returning a handle. The "wait" waits on the returned | ||||||
1294 | /// handle for the memory transfer to finish. | ||||||
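/// Sketch of the intended rewrite (illustrative IR, using the issue/wait
/// entry points emitted below):
///   call void @__tgt_target_data_begin_mapper(...)
/// becomes
///   call void @__tgt_target_data_begin_mapper_issue(..., %handle)
///   ; ... code that does not touch the transferred memory ...
///   call void @__tgt_target_data_begin_mapper_wait(%device_id, %handle)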
1295 | bool hideMemTransfersLatency() { | ||||||
1296 | auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper]; | ||||||
1297 | bool Changed = false; | ||||||
1298 | auto SplitMemTransfers = [&](Use &U, Function &Decl) { | ||||||
1299 | auto *RTCall = getCallIfRegularCall(U, &RFI); | ||||||
1300 | if (!RTCall) | ||||||
1301 | return false; | ||||||
1302 | |||||||
1303 | OffloadArray OffloadArrays[3]; | ||||||
1304 | if (!getValuesInOffloadArrays(*RTCall, OffloadArrays)) | ||||||
1305 | return false; | ||||||
1306 | |||||||
1307 | LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays)); | ||||||
1308 | |||||||
1309 | // TODO: Check if can be moved upwards. | ||||||
1310 | bool WasSplit = false; | ||||||
1311 | Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall); | ||||||
1312 | if (WaitMovementPoint) | ||||||
1313 | WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint); | ||||||
1314 | |||||||
1315 | Changed |= WasSplit; | ||||||
1316 | return WasSplit; | ||||||
1317 | }; | ||||||
1318 | RFI.foreachUse(SCC, SplitMemTransfers); | ||||||
1319 | |||||||
1320 | return Changed; | ||||||
1321 | } | ||||||
1322 | |||||||
1323 | void analysisGlobalization() { | ||||||
1324 | auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; | ||||||
1325 | |||||||
1326 | auto CheckGlobalization = [&](Use &U, Function &Decl) { | ||||||
1327 | if (CallInst *CI = getCallIfRegularCall(U, &RFI)) { | ||||||
1328 | auto Remark = [&](OptimizationRemarkMissed ORM) { | ||||||
1329 | return ORM | ||||||
1330 | << "Found thread data sharing on the GPU. " | ||||||
1331 | << "Expect degraded performance due to data globalization."; | ||||||
1332 | }; | ||||||
1333 | emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark); | ||||||
1334 | } | ||||||
1335 | |||||||
1336 | return false; | ||||||
1337 | }; | ||||||
1338 | |||||||
1339 | RFI.foreachUse(SCC, CheckGlobalization); | ||||||
1340 | } | ||||||
1341 | |||||||
1342 | /// Maps the values stored in the offload arrays passed as arguments to | ||||||
1343 | /// \p RuntimeCall into the offload arrays in \p OAs. | ||||||
1344 | bool getValuesInOffloadArrays(CallInst &RuntimeCall, | ||||||
1345 | MutableArrayRef<OffloadArray> OAs) { | ||||||
1346 | assert(OAs.size() == 3 && "Need space for three offload arrays!"); | ||||||
1347 | |||||||
1348 | // A runtime call that involves memory offloading looks something like: | ||||||
1349 | // call void @__tgt_target_data_begin_mapper(arg0, arg1, | ||||||
1350 | // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, | ||||||
1351 | // ...) | ||||||
1352 | // So, the idea is to access the allocas that allocate space for these | ||||||
1353 | // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes. | ||||||
1354 | // Therefore: | ||||||
1355 | // i8** %offload_baseptrs. | ||||||
1356 | Value *BasePtrsArg = | ||||||
1357 | RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum); | ||||||
1358 | // i8** %offload_ptrs. | ||||||
1359 | Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum); | ||||||
1360 | // i8** %offload_sizes. | ||||||
1361 | Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum); | ||||||
1362 | |||||||
1363 | // Get values stored in **offload_baseptrs. | ||||||
1364 | auto *V = getUnderlyingObject(BasePtrsArg); | ||||||
1365 | if (!isa<AllocaInst>(V)) | ||||||
1366 | return false; | ||||||
1367 | auto *BasePtrsArray = cast<AllocaInst>(V); | ||||||
1368 | if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall)) | ||||||
1369 | return false; | ||||||
1370 | |||||||
1371 | // Get values stored in **offload_ptrs. | ||||||
1372 | V = getUnderlyingObject(PtrsArg); | ||||||
1373 | if (!isa<AllocaInst>(V)) | ||||||
1374 | return false; | ||||||
1375 | auto *PtrsArray = cast<AllocaInst>(V); | ||||||
1376 | if (!OAs[1].initialize(*PtrsArray, RuntimeCall)) | ||||||
1377 | return false; | ||||||
1378 | |||||||
1379 | // Get values stored in **offload_sizes. | ||||||
1380 | V = getUnderlyingObject(SizesArg); | ||||||
1381 | // If it's a [constant] global array don't analyze it. | ||||||
1382 | if (isa<GlobalValue>(V)) | ||||||
1383 | return isa<Constant>(V); | ||||||
1384 | if (!isa<AllocaInst>(V)) | ||||||
1385 | return false; | ||||||
1386 | |||||||
1387 | auto *SizesArray = cast<AllocaInst>(V); | ||||||
1388 | if (!OAs[2].initialize(*SizesArray, RuntimeCall)) | ||||||
1389 | return false; | ||||||
1390 | |||||||
1391 | return true; | ||||||
1392 | } | ||||||
1393 | |||||||
1394 | /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG. | ||||||
1395 | /// For now this is a way to test that the function getValuesInOffloadArrays | ||||||
1396 | /// is working properly. | ||||||
1397 | /// TODO: Move this to a unittest when unittests are available for OpenMPOpt. | ||||||
1398 | void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) { | ||||||
1399 | assert(OAs.size() == 3 && "There are three offload arrays to debug!"); | ||||||
1400 | |||||||
1401 | LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n"); | ||||||
1402 | std::string ValuesStr; | ||||||
1403 | raw_string_ostream Printer(ValuesStr); | ||||||
1404 | std::string Separator = " --- "; | ||||||
1405 | |||||||
1406 | for (auto *BP : OAs[0].StoredValues) { | ||||||
1407 | BP->print(Printer); | ||||||
1408 | Printer << Separator; | ||||||
1409 | } | ||||||
1410 | LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n"); | ||||||
1411 | ValuesStr.clear(); | ||||||
1412 | |||||||
1413 | for (auto *P : OAs[1].StoredValues) { | ||||||
1414 | P->print(Printer); | ||||||
1415 | Printer << Separator; | ||||||
1416 | } | ||||||
1417 | LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n"); | ||||||
1418 | ValuesStr.clear(); | ||||||
1419 | |||||||
1420 | for (auto *S : OAs[2].StoredValues) { | ||||||
1421 | S->print(Printer); | ||||||
1422 | Printer << Separator; | ||||||
1423 | } | ||||||
1424 | LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n"); | ||||||
1425 | } | ||||||
1426 | |||||||
1427 | /// Returns the instruction where the "wait" counterpart of \p RuntimeCall can be | ||||||
1428 | /// moved. Returns nullptr if the movement is not possible, or not worth it. | ||||||
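/// E.g., the "wait" can be pushed past side-effect-free arithmetic, but it
/// must stop at the first instruction that may read or write memory, which
/// serves as a conservative stand-in for touching the offloaded data.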
1429 | Instruction *canBeMovedDownwards(CallInst &RuntimeCall) { | ||||||
1430 | // FIXME: This traverses only the BasicBlock where RuntimeCall is. | ||||||
1431 | // Make it traverse the CFG. | ||||||
1432 | |||||||
1433 | Instruction *CurrentI = &RuntimeCall; | ||||||
1434 | bool IsWorthIt = false; | ||||||
1435 | while ((CurrentI = CurrentI->getNextNode())) { | ||||||
1436 | |||||||
1437 | // TODO: Once we detect the regions to be offloaded we should use the | ||||||
1438 | // alias analysis manager to check if CurrentI may modify one of | ||||||
1439 | // the offloaded regions. | ||||||
1440 | if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) { | ||||||
1441 | if (IsWorthIt) | ||||||
1442 | return CurrentI; | ||||||
1443 | |||||||
1444 | return nullptr; | ||||||
1445 | } | ||||||
1446 | |||||||
1447 | // FIXME: For now, moving it over anything without side effects is | ||||||
1448 | // considered worth it. | ||||||
1449 | IsWorthIt = true; | ||||||
1450 | } | ||||||
1451 | |||||||
1452 | // Return end of BasicBlock. | ||||||
1453 | return RuntimeCall.getParent()->getTerminator(); | ||||||
1454 | } | ||||||
1455 | |||||||
1456 | /// Splits \p RuntimeCall into its "issue" and "wait" counterparts. | ||||||
1457 | bool splitTargetDataBeginRTC(CallInst &RuntimeCall, | ||||||
1458 | Instruction &WaitMovementPoint) { | ||||||
1459 | // Create a stack-allocated handle (__tgt_async_info) at the beginning of the | ||||||
1460 | // function. It stores information about the async transfer, allowing us to | ||||||
1461 | // wait on it later. | ||||||
1462 | auto &IRBuilder = OMPInfoCache.OMPBuilder; | ||||||
1463 | auto *F = RuntimeCall.getCaller(); | ||||||
1464 | Instruction *FirstInst = &(F->getEntryBlock().front()); | ||||||
1465 | AllocaInst *Handle = new AllocaInst( | ||||||
1466 | IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst); | ||||||
1467 | |||||||
1468 | // Add "issue" runtime call declaration: | ||||||
1469 | // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32, | ||||||
1470 | // i8**, i8**, i64*, i64*) | ||||||
1471 | FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction( | ||||||
1472 | M, OMPRTL___tgt_target_data_begin_mapper_issue); | ||||||
1473 | |||||||
1474 | // Change RuntimeCall call site for its asynchronous version. | ||||||
1475 | SmallVector<Value *, 16> Args; | ||||||
1476 | for (auto &Arg : RuntimeCall.args()) | ||||||
1477 | Args.push_back(Arg.get()); | ||||||
1478 | Args.push_back(Handle); | ||||||
1479 | |||||||
1480 | CallInst *IssueCallsite = | ||||||
1481 | CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall); | ||||||
1482 | RuntimeCall.eraseFromParent(); | ||||||
1483 | |||||||
1484 | // Add "wait" runtime call declaration: | ||||||
1485 | // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info) | ||||||
1486 | FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction( | ||||||
1487 | M, OMPRTL___tgt_target_data_begin_mapper_wait); | ||||||
1488 | |||||||
1489 | Value *WaitParams[2] = { | ||||||
1490 | IssueCallsite->getArgOperand( | ||||||
1491 | OffloadArray::DeviceIDArgNum), // device_id. | ||||||
1492 | Handle // handle to wait on. | ||||||
1493 | }; | ||||||
1494 | CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint); | ||||||
1495 | |||||||
1496 | return true; | ||||||
1497 | } | ||||||
1498 | |||||||
1499 | static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, | ||||||
1500 | bool GlobalOnly, bool &SingleChoice) { | ||||||
1501 | if (CurrentIdent == NextIdent) | ||||||
1502 | return CurrentIdent; | ||||||
1503 | |||||||
1504 | // TODO: Figure out how to actually combine multiple debug locations. For | ||||||
1505 | // now we just keep an existing one if there is a single choice. | ||||||
1506 | if (!GlobalOnly || isa<GlobalValue>(NextIdent)) { | ||||||
1507 | SingleChoice = !CurrentIdent; | ||||||
1508 | return NextIdent; | ||||||
1509 | } | ||||||
1510 | return nullptr; | ||||||
1511 | } | ||||||
1512 | |||||||
1513 | /// Return a `struct ident_t*` value that represents the ones used in the | ||||||
1514 | /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not | ||||||
1515 | /// return a local `struct ident_t*`. For now, if we cannot find a suitable | ||||||
1516 | /// return value we create one from scratch. We also do not yet combine | ||||||
1517 | /// information, e.g., the source locations, see combinedIdentStruct. | ||||||
1518 | Value * | ||||||
1519 | getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, | ||||||
1520 | Function &F, bool GlobalOnly) { | ||||||
1521 | bool SingleChoice = true; | ||||||
1522 | Value *Ident = nullptr; | ||||||
1523 | auto CombineIdentStruct = [&](Use &U, Function &Caller) { | ||||||
1524 | CallInst *CI = getCallIfRegularCall(U, &RFI); | ||||||
1525 | if (!CI || &F != &Caller) | ||||||
1526 | return false; | ||||||
1527 | Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), | ||||||
1528 | /* GlobalOnly */ true, SingleChoice); | ||||||
1529 | return false; | ||||||
1530 | }; | ||||||
1531 | RFI.foreachUse(SCC, CombineIdentStruct); | ||||||
1532 | |||||||
1533 | if (!Ident || !SingleChoice) { | ||||||
1534 | // The IRBuilder uses the insertion block to get to the module; this is | ||||||
1535 | // unfortunate, but we work around it for now. | ||||||
1536 | if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock()) | ||||||
1537 | OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy( | ||||||
1538 | &F.getEntryBlock(), F.getEntryBlock().begin())); | ||||||
1539 | // Create a fallback location if none was found. | ||||||
1540 | // TODO: Use the debug locations of the calls instead. | ||||||
1541 | Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); | ||||||
1542 | Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); | ||||||
1543 | } | ||||||
1544 | return Ident; | ||||||
1545 | } | ||||||
1546 | |||||||
1547 | /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or | ||||||
1548 | /// \p ReplVal if given. | ||||||
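/// For example (illustrative): two calls to @omp_get_thread_limit() in \p F
/// collapse into the one that can be moved to the function entry (or right
/// after the kernel init call); uses of the erased call are rewritten to the
/// kept one.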
1549 | bool deduplicateRuntimeCalls(Function &F, | ||||||
1550 | OMPInformationCache::RuntimeFunctionInfo &RFI, | ||||||
1551 | Value *ReplVal = nullptr) { | ||||||
1552 | auto *UV = RFI.getUseVector(F); | ||||||
1553 | if (!UV || UV->size() + (ReplVal != nullptr) < 2) | ||||||
1554 | return false; | ||||||
1555 | |||||||
1556 | LLVM_DEBUG( | ||||||
1557 | dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name | ||||||
1558 | << (ReplVal ? " with an existing value\n" : "\n") << "\n"); | ||||||
1559 | |||||||
1560 | assert((!ReplVal || (isa<Argument>(ReplVal) && | ||||||
1561 | cast<Argument>(ReplVal)->getParent() == &F)) && | ||||||
1562 | "Unexpected replacement value!"); | ||||||
1563 | |||||||
1564 | // TODO: Use dominance to find a good position instead. | ||||||
1565 | auto CanBeMoved = [this](CallBase &CB) { | ||||||
1566 | unsigned NumArgs = CB.getNumArgOperands(); | ||||||
1567 | if (NumArgs == 0) | ||||||
1568 | return true; | ||||||
1569 | if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr) | ||||||
1570 | return false; | ||||||
1571 | for (unsigned u = 1; u < NumArgs; ++u) | ||||||
1572 | if (isa<Instruction>(CB.getArgOperand(u))) | ||||||
1573 | return false; | ||||||
1574 | return true; | ||||||
1575 | }; | ||||||
1576 | |||||||
1577 | if (!ReplVal) { | ||||||
1578 | for (Use *U : *UV) | ||||||
1579 | if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { | ||||||
1580 | if (!CanBeMoved(*CI)) | ||||||
1581 | continue; | ||||||
1582 | |||||||
1583 | // If the function is a kernel, dedup will move | ||||||
1584 | // the runtime call right after the kernel init callsite. Otherwise, | ||||||
1585 | // it will move it to the beginning of the caller function. | ||||||
1586 | if (isKernel(F)) { | ||||||
1587 | auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; | ||||||
1588 | auto *KernelInitUV = KernelInitRFI.getUseVector(F); | ||||||
1589 | |||||||
1590 | if (KernelInitUV->empty()) | ||||||
1591 | continue; | ||||||
1592 | |||||||
1593 | assert(KernelInitUV->size() == 1 && | ||||||
1594 | "Expected a single __kmpc_target_init in kernel\n"); | ||||||
1595 | |||||||
1596 | CallInst *KernelInitCI = | ||||||
1597 | getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI); | ||||||
1598 | assert(KernelInitCI && | ||||||
1599 | "Expected a call to __kmpc_target_init in kernel\n"); | ||||||
1600 | |||||||
1601 | CI->moveAfter(KernelInitCI); | ||||||
1602 | } else | ||||||
1603 | CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); | ||||||
1604 | ReplVal = CI; | ||||||
1605 | break; | ||||||
1606 | } | ||||||
1607 | if (!ReplVal) | ||||||
1608 | return false; | ||||||
1609 | } | ||||||
1610 | |||||||
1611 | // If we use a call as a replacement value we need to make sure the ident is | ||||||
1612 | // valid at the new location. For now we just pick a global one, either | ||||||
1613 | // existing and used by one of the calls, or created from scratch. | ||||||
1614 | if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) { | ||||||
1615 | if (CI->getNumArgOperands() > 0 && | ||||||
1616 | CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) { | ||||||
1617 | Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, | ||||||
1618 | /* GlobalOnly */ true); | ||||||
1619 | CI->setArgOperand(0, Ident); | ||||||
1620 | } | ||||||
1621 | } | ||||||
1622 | |||||||
1623 | bool Changed = false; | ||||||
1624 | auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) { | ||||||
1625 | CallInst *CI = getCallIfRegularCall(U, &RFI); | ||||||
1626 | if (!CI || CI == ReplVal || &F != &Caller) | ||||||
1627 | return false; | ||||||
1628 | assert(CI->getCaller() == &F && "Unexpected call!"); | ||||||
1629 | |||||||
1630 | auto Remark = [&](OptimizationRemark OR) { | ||||||
1631 | return OR << "OpenMP runtime call " | ||||||
1632 | << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated."; | ||||||
1633 | }; | ||||||
1634 | if (CI->getDebugLoc()) | ||||||
1635 | emitRemark<OptimizationRemark>(CI, "OMP170", Remark); | ||||||
1636 | else | ||||||
1637 | emitRemark<OptimizationRemark>(&F, "OMP170", Remark); | ||||||
1638 | |||||||
1639 | CGUpdater.removeCallSite(*CI); | ||||||
1640 | CI->replaceAllUsesWith(ReplVal); | ||||||
1641 | CI->eraseFromParent(); | ||||||
1642 | ++NumOpenMPRuntimeCallsDeduplicated; | ||||||
1643 | Changed = true; | ||||||
1644 | return true; | ||||||
1645 | }; | ||||||
1646 | RFI.foreachUse(SCC, ReplaceAndDeleteCB); | ||||||
1647 | |||||||
1648 | return Changed; | ||||||
1649 | } | ||||||
1650 | |||||||
1651 | /// Collect arguments that represent the global thread id in \p GTIdArgs. | ||||||
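/// E.g. (illustrative): if the result of a __kmpc_global_thread_num call is
/// only ever passed as the same argument of an internal function, that
/// argument is recorded as a global thread id as well.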
1652 | void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) { | ||||||
1653 | // TODO: Below we basically perform a fixpoint iteration with a pessimistic | ||||||
1654 | // initialization. We could define an AbstractAttribute instead and | ||||||
1655 | // run the Attributor here once it can be run as an SCC pass. | ||||||
1656 | |||||||
1657 | // Helper to check the argument \p ArgNo at all call sites of \p F for | ||||||
1658 | // a GTId. | ||||||
1659 | auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { | ||||||
1660 | if (!F.hasLocalLinkage()) | ||||||
1661 | return false; | ||||||
1662 | for (Use &U : F.uses()) { | ||||||
1663 | if (CallInst *CI = getCallIfRegularCall(U)) { | ||||||
1664 | Value *ArgOp = CI->getArgOperand(ArgNo); | ||||||
1665 | if (CI == &RefCI || GTIdArgs.count(ArgOp) || | ||||||
1666 | getCallIfRegularCall( | ||||||
1667 | *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num])) | ||||||
1668 | continue; | ||||||
1669 | } | ||||||
1670 | return false; | ||||||
1671 | } | ||||||
1672 | return true; | ||||||
1673 | }; | ||||||
1674 | |||||||
1675 | // Helper to identify uses of a GTId as GTId arguments. | ||||||
1676 | auto AddUserArgs = [&](Value &GTId) { | ||||||
1677 | for (Use &U : GTId.uses()) | ||||||
1678 | if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) | ||||||
1679 | if (CI->isArgOperand(&U)) | ||||||
1680 | if (Function *Callee = CI->getCalledFunction()) | ||||||
1681 | if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) | ||||||
1682 | GTIdArgs.insert(Callee->getArg(U.getOperandNo())); | ||||||
1683 | }; | ||||||
1684 | |||||||
1685 | // The argument users of __kmpc_global_thread_num calls are GTIds. | ||||||
1686 | OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI = | ||||||
1687 | OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]; | ||||||
1688 | |||||||
1689 | GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) { | ||||||
1690 | if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI)) | ||||||
1691 | AddUserArgs(*CI); | ||||||
1692 | return false; | ||||||
1693 | }); | ||||||
1694 | |||||||
1695 | // Transitively search for more arguments by looking at the users of the | ||||||
1696 | // ones we know already. During the search the GTIdArgs vector is extended | ||||||
1697 | // so we cannot cache the size nor can we use a range-based for loop. | ||||||
1698 | for (unsigned u = 0; u < GTIdArgs.size(); ++u) | ||||||
1699 | AddUserArgs(*GTIdArgs[u]); | ||||||
1700 | } | ||||||
1701 | |||||||
1702 | /// Kernel (=GPU) optimizations and utility functions | ||||||
1703 | /// | ||||||
1704 | ///{{ | ||||||
1705 | |||||||
1706 | /// Check if \p F is a kernel, hence entry point for target offloading. | ||||||
1707 | bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); } | ||||||
1708 | |||||||
1709 | /// Cache to remember the unique kernel for a function. | ||||||
1710 | DenseMap<Function *, Optional<Kernel>> UniqueKernelMap; | ||||||
1711 | |||||||
1712 | /// Find the unique kernel that will execute \p F, if any. | ||||||
1713 | Kernel getUniqueKernelFor(Function &F); | ||||||
1714 | |||||||
1715 | /// Find the unique kernel that will execute \p I, if any. | ||||||
1716 | Kernel getUniqueKernelFor(Instruction &I) { | ||||||
1717 | return getUniqueKernelFor(*I.getFunction()); | ||||||
1718 | } | ||||||
1719 | |||||||
1720 | /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in | ||||||
1721 | /// the cases we can avoid taking the address of a function. | ||||||
1722 | bool rewriteDeviceCodeStateMachine(); | ||||||
1723 | |||||||
1724 | /// | ||||||
1725 | ///}} | ||||||
1726 | |||||||
1727 | /// Emit a remark generically | ||||||
1728 | /// | ||||||
1729 | /// This template function can be used to generically emit a remark. The | ||||||
1730 | /// RemarkKind should be one of the following: | ||||||
1731 | /// - OptimizationRemark to indicate a successful optimization attempt | ||||||
1732 | /// - OptimizationRemarkMissed to report a failed optimization attempt | ||||||
1733 | /// - OptimizationRemarkAnalysis to provide additional information about an | ||||||
1734 | /// optimization attempt | ||||||
1735 | /// | ||||||
1736 | /// The remark is built using a callback function provided by the caller that | ||||||
1737 | /// takes a RemarkKind as input and returns a RemarkKind. | ||||||
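/// A typical use, taken from elsewhere in this file:
/// \code
///   auto Remark = [&](OptimizationRemark OR) {
///     return OR << "Removing parallel region with no side-effects.";
///   };
///   emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
/// \endcode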
1738 | template <typename RemarkKind, typename RemarkCallBack> | ||||||
1739 | void emitRemark(Instruction *I, StringRef RemarkName, | ||||||
1740 | RemarkCallBack &&RemarkCB) const { | ||||||
1741 | Function *F = I->getParent()->getParent(); | ||||||
1742 | auto &ORE = OREGetter(F); | ||||||
1743 | |||||||
1744 | if (RemarkName.startswith("OMP")) | ||||||
1745 | ORE.emit([&]() { | ||||||
1746 | return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)) | ||||||
1747 | << " [" << RemarkName << "]"; | ||||||
1748 | }); | ||||||
1749 | else | ||||||
1750 | ORE.emit( | ||||||
1751 | [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); }); | ||||||
1752 | } | ||||||
1753 | |||||||
1754 | /// Emit a remark on a function. | ||||||
1755 | template <typename RemarkKind, typename RemarkCallBack> | ||||||
1756 | void emitRemark(Function *F, StringRef RemarkName, | ||||||
1757 | RemarkCallBack &&RemarkCB) const { | ||||||
1758 | auto &ORE = OREGetter(F); | ||||||
1759 | |||||||
1760 | if (RemarkName.startswith("OMP")) | ||||||
1761 | ORE.emit([&]() { | ||||||
1762 | return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)) | ||||||
1763 | << " [" << RemarkName << "]"; | ||||||
1764 | }); | ||||||
1765 | else | ||||||
1766 | ORE.emit( | ||||||
1767 | [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); }); | ||||||
1768 | } | ||||||
1769 | |||||||
1770 | /// RAII struct to temporarily change an RTL function's linkage to external. | ||||||
1771 | /// This prevents it from being mistakenly removed by other optimizations. | ||||||
1772 | struct ExternalizationRAII { | ||||||
1773 | ExternalizationRAII(OMPInformationCache &OMPInfoCache, | ||||||
1774 | RuntimeFunction RFKind) | ||||||
1775 | : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) { | ||||||
1776 | if (!Declaration) | ||||||
1777 | return; | ||||||
1778 | |||||||
1779 | LinkageType = Declaration->getLinkage(); | ||||||
1780 | Declaration->setLinkage(GlobalValue::ExternalLinkage); | ||||||
1781 | } | ||||||
1782 | |||||||
1783 | ~ExternalizationRAII() { | ||||||
1784 | if (!Declaration) | ||||||
1785 | return; | ||||||
1786 | |||||||
1787 | Declaration->setLinkage(LinkageType); | ||||||
1788 | } | ||||||
1789 | |||||||
1790 | Function *Declaration; | ||||||
1791 | GlobalValue::LinkageTypes LinkageType; | ||||||
1792 | }; | ||||||
1793 | |||||||
1794 | /// The underlying module. | ||||||
1795 | Module &M; | ||||||
1796 | |||||||
1797 | /// The SCC we are operating on. | ||||||
1798 | SmallVectorImpl<Function *> &SCC; | ||||||
1799 | |||||||
1800 | /// Callback to update the call graph, the first argument is a removed call, | ||||||
1801 | /// the second an optional replacement call. | ||||||
1802 | CallGraphUpdater &CGUpdater; | ||||||
1803 | |||||||
1804 | /// Callback to get an OptimizationRemarkEmitter from a Function * | ||||||
1805 | OptimizationRemarkGetter OREGetter; | ||||||
1806 | |||||||
1807 | /// OpenMP-specific information cache. Also used for Attributor runs. | ||||||
1808 | OMPInformationCache &OMPInfoCache; | ||||||
1809 | |||||||
1810 | /// Attributor instance. | ||||||
1811 | Attributor &A; | ||||||
1812 | |||||||
1813 | /// Helper function to run Attributor on SCC. | ||||||
1814 | bool runAttributor(bool IsModulePass) { | ||||||
1815 | if (SCC.empty()) | ||||||
1816 | return false; | ||||||
1817 | |||||||
1818 | // Temporarily make these functions have external linkage so the Attributor | ||||||
1819 | // doesn't remove them when we try to look them up later. | ||||||
1820 | ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel); | ||||||
1821 | ExternalizationRAII EndParallel(OMPInfoCache, | ||||||
1822 | OMPRTL___kmpc_kernel_end_parallel); | ||||||
1823 | ExternalizationRAII BarrierSPMD(OMPInfoCache, | ||||||
1824 | OMPRTL___kmpc_barrier_simple_spmd); | ||||||
1825 | |||||||
1826 | registerAAs(IsModulePass); | ||||||
1827 | |||||||
1828 | ChangeStatus Changed = A.run(); | ||||||
1829 | |||||||
1830 | LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() | ||||||
1831 | << " functions, result: " << Changed << ".\n"); | ||||||
1832 | |||||||
1833 | return Changed == ChangeStatus::CHANGED; | ||||||
1834 | } | ||||||
1835 | |||||||
1836 | void registerFoldRuntimeCall(RuntimeFunction RF); | ||||||
1837 | |||||||
1838 | /// Populate the Attributor with abstract attribute opportunities in the | ||||||
1839 | /// function. | ||||||
1840 | void registerAAs(bool IsModulePass); | ||||||
1841 | }; | ||||||
1842 | |||||||
1843 | Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { | ||||||
1844 | if (!OMPInfoCache.ModuleSlice.count(&F)) | ||||||
1845 | return nullptr; | ||||||
1846 | |||||||
1847 | // Use a scope to keep the lifetime of the CachedKernel short. | ||||||
1848 | { | ||||||
1849 | Optional<Kernel> &CachedKernel = UniqueKernelMap[&F]; | ||||||
1850 | if (CachedKernel) | ||||||
1851 | return *CachedKernel; | ||||||
1852 | |||||||
1853 | // TODO: We should use an AA to create an (optimistic and callback | ||||||
1854 | // call-aware) call graph. For now we stick to simple patterns that | ||||||
1855 | // are less powerful, basically the worst fixpoint. | ||||||
1856 | if (isKernel(F)) { | ||||||
1857 | CachedKernel = Kernel(&F); | ||||||
1858 | return *CachedKernel; | ||||||
1859 | } | ||||||
1860 | |||||||
1861 | CachedKernel = nullptr; | ||||||
1862 | if (!F.hasLocalLinkage()) { | ||||||
1863 | |||||||
1864 | // See https://openmp.llvm.org/remarks/OptimizationRemarks.html | ||||||
1865 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
1866 | return ORA << "Potentially unknown OpenMP target region caller."; | ||||||
1867 | }; | ||||||
1868 | emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark); | ||||||
1869 | |||||||
1870 | return nullptr; | ||||||
1871 | } | ||||||
1872 | } | ||||||
1873 | |||||||
1874 | auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel { | ||||||
1875 | if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) { | ||||||
1876 | // Allow use in equality comparisons. | ||||||
1877 | if (Cmp->isEquality()) | ||||||
1878 | return getUniqueKernelFor(*Cmp); | ||||||
1879 | return nullptr; | ||||||
1880 | } | ||||||
1881 | if (auto *CB = dyn_cast<CallBase>(U.getUser())) { | ||||||
1882 | // Allow direct calls. | ||||||
1883 | if (CB->isCallee(&U)) | ||||||
1884 | return getUniqueKernelFor(*CB); | ||||||
1885 | |||||||
1886 | OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = | ||||||
1887 | OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; | ||||||
1888 | // Allow the use in __kmpc_parallel_51 calls. | ||||||
1889 | if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI)) | ||||||
1890 | return getUniqueKernelFor(*CB); | ||||||
1891 | return nullptr; | ||||||
1892 | } | ||||||
1893 | // Disallow every other use. | ||||||
1894 | return nullptr; | ||||||
1895 | }; | ||||||
1896 | |||||||
1897 | // TODO: In the future we want to track more than just a unique kernel. | ||||||
1898 | SmallPtrSet<Kernel, 2> PotentialKernels; | ||||||
1899 | OMPInformationCache::foreachUse(F, [&](const Use &U) { | ||||||
1900 | PotentialKernels.insert(GetUniqueKernelForUse(U)); | ||||||
1901 | }); | ||||||
1902 | |||||||
1903 | Kernel K = nullptr; | ||||||
1904 | if (PotentialKernels.size() == 1) | ||||||
1905 | K = *PotentialKernels.begin(); | ||||||
1906 | |||||||
1907 | // Cache the result. | ||||||
1908 | UniqueKernelMap[&F] = K; | ||||||
1909 | |||||||
1910 | return K; | ||||||
1911 | } | ||||||
1912 | |||||||
1913 | bool OpenMPOpt::rewriteDeviceCodeStateMachine() { | ||||||
1914 | OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI = | ||||||
1915 | OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; | ||||||
1916 | |||||||
1917 | bool Changed = false; | ||||||
1918 | if (!KernelParallelRFI) | ||||||
1919 | return Changed; | ||||||
1920 | |||||||
1921 | for (Function *F : SCC) { | ||||||
1922 | |||||||
1923 | // Check if the function is a use in a __kmpc_parallel_51 call at | ||||||
1924 | // all. | ||||||
1925 | bool UnknownUse = false; | ||||||
1926 | bool KernelParallelUse = false; | ||||||
1927 | unsigned NumDirectCalls = 0; | ||||||
1928 | |||||||
1929 | SmallVector<Use *, 2> ToBeReplacedStateMachineUses; | ||||||
1930 | OMPInformationCache::foreachUse(*F, [&](Use &U) { | ||||||
1931 | if (auto *CB = dyn_cast<CallBase>(U.getUser())) | ||||||
1932 | if (CB->isCallee(&U)) { | ||||||
1933 | ++NumDirectCalls; | ||||||
1934 | return; | ||||||
1935 | } | ||||||
1936 | |||||||
1937 | if (isa<ICmpInst>(U.getUser())) { | ||||||
1938 | ToBeReplacedStateMachineUses.push_back(&U); | ||||||
1939 | return; | ||||||
1940 | } | ||||||
1941 | |||||||
1942 | // Find wrapper functions that represent parallel kernels. | ||||||
1943 | CallInst *CI = | ||||||
1944 | OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI); | ||||||
1945 | const unsigned int WrapperFunctionArgNo = 6; | ||||||
1946 | if (!KernelParallelUse && CI && | ||||||
1947 | CI->getArgOperandNo(&U) == WrapperFunctionArgNo) { | ||||||
1948 | KernelParallelUse = true; | ||||||
1949 | ToBeReplacedStateMachineUses.push_back(&U); | ||||||
1950 | return; | ||||||
1951 | } | ||||||
1952 | UnknownUse = true; | ||||||
1953 | }); | ||||||
1954 | |||||||
1955 | // Do not emit a remark if we haven't seen a __kmpc_parallel_51 | ||||||
1956 | // use. | ||||||
1957 | if (!KernelParallelUse) | ||||||
1958 | continue; | ||||||
1959 | |||||||
1960 | // If this ever hits, we should investigate. | ||||||
1961 | // TODO: Checking the number of uses is not a necessary restriction and | ||||||
1962 | // should be lifted. | ||||||
1963 | if (UnknownUse || NumDirectCalls != 1 || | ||||||
1964 | ToBeReplacedStateMachineUses.size() > 2) { | ||||||
1965 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
1966 | return ORA << "Parallel region is used in " | ||||||
1967 | << (UnknownUse ? "unknown" : "unexpected") | ||||||
1968 | << " ways. Will not attempt to rewrite the state machine."; | ||||||
1969 | }; | ||||||
1970 | emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark); | ||||||
1971 | continue; | ||||||
1972 | } | ||||||
1973 | |||||||
1974 | // Even if we have __kmpc_parallel_51 calls, we (for now) give | ||||||
1975 | // up if the function is not called from a unique kernel. | ||||||
1976 | Kernel K = getUniqueKernelFor(*F); | ||||||
1977 | if (!K) { | ||||||
1978 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
1979 | return ORA << "Parallel region is not called from a unique kernel. " | ||||||
1980 | "Will not attempt to rewrite the state machine."; | ||||||
1981 | }; | ||||||
1982 | emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark); | ||||||
1983 | continue; | ||||||
1984 | } | ||||||
1985 | |||||||
1986 | // We now know F is a parallel body function called only from the kernel K. | ||||||
1987 | // We also identified the state machine uses in which we replace the | ||||||
1988 | // function pointer by a new global symbol for identification purposes. This | ||||||
1989 | // ensures only direct calls to the function are left. | ||||||
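// Roughly (illustrative): both the wrapper argument of the
// __kmpc_parallel_51 call and the equality compares in the state machine
// now use the new @<function>.ID global instead of the function's address,
// so the only remaining uses of the function itself are direct calls.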
1990 | |||||||
1991 | Module &M = *F->getParent(); | ||||||
1992 | Type *Int8Ty = Type::getInt8Ty(M.getContext()); | ||||||
1993 | |||||||
1994 | auto *ID = new GlobalVariable( | ||||||
1995 | M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage, | ||||||
1996 | UndefValue::get(Int8Ty), F->getName() + ".ID"); | ||||||
1997 | |||||||
1998 | for (Use *U : ToBeReplacedStateMachineUses) | ||||||
1999 | U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast( | ||||||
2000 | ID, U->get()->getType())); | ||||||
2001 | |||||||
2002 | ++NumOpenMPParallelRegionsReplacedInGPUStateMachine; | ||||||
2003 | |||||||
2004 | Changed = true; | ||||||
2005 | } | ||||||
2006 | |||||||
2007 | return Changed; | ||||||
2008 | } | ||||||
2009 | |||||||
2010 | /// Abstract Attribute for tracking ICV values. | ||||||
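/// E.g., for the 'nthreads' ICV (the only one tracked below) the trackers
/// record the value passed to the ICV setter and, where that value still
/// holds at a getter call site, allow the getter to be folded to it.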
2011 | struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { | ||||||
2012 | using Base = StateWrapper<BooleanState, AbstractAttribute>; | ||||||
2013 | AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {} | ||||||
2014 | |||||||
2015 | void initialize(Attributor &A) override { | ||||||
2016 | Function *F = getAnchorScope(); | ||||||
2017 | if (!F || !A.isFunctionIPOAmendable(*F)) | ||||||
2018 | indicatePessimisticFixpoint(); | ||||||
2019 | } | ||||||
2020 | |||||||
2021 | /// Returns true if value is assumed to be tracked. | ||||||
2022 | bool isAssumedTracked() const { return getAssumed(); } | ||||||
2023 | |||||||
2024 | /// Returns true if value is known to be tracked. | ||||||
2025 | bool isKnownTracked() const { return getAssumed(); } | ||||||
2026 | |||||||
2027 | /// Create an abstract attribute view for the position \p IRP. | ||||||
2028 | static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A); | ||||||
2029 | |||||||
2030 | /// Return the value with which \p I can be replaced for specific \p ICV. | ||||||
2031 | virtual Optional<Value *> getReplacementValue(InternalControlVar ICV, | ||||||
2032 | const Instruction *I, | ||||||
2033 | Attributor &A) const { | ||||||
2034 | return None; | ||||||
2035 | } | ||||||
2036 | |||||||
2037 | /// Return an assumed unique ICV value if a single candidate is found. If | ||||||
2038 | /// there cannot be one, return a nullptr. If it is not clear yet, return the | ||||||
2039 | /// Optional::NoneType. | ||||||
2040 | virtual Optional<Value *> | ||||||
2041 | getUniqueReplacementValue(InternalControlVar ICV) const = 0; | ||||||
2042 | |||||||
2043 | // Currently only nthreads is being tracked. | ||||||
2044 | // This array will only grow with time. | ||||||
2045 | InternalControlVar TrackableICVs[1] = {ICV_nthreads}; | ||||||
2046 | |||||||
2047 | /// See AbstractAttribute::getName() | ||||||
2048 | const std::string getName() const override { return "AAICVTracker"; } | ||||||
2049 | |||||||
2050 | /// See AbstractAttribute::getIdAddr() | ||||||
2051 | const char *getIdAddr() const override { return &ID; } | ||||||
2052 | |||||||
2053 | /// This function should return true if the type of the \p AA is AAICVTracker | ||||||
2054 | static bool classof(const AbstractAttribute *AA) { | ||||||
2055 | return (AA->getIdAddr() == &ID); | ||||||
2056 | } | ||||||
2057 | |||||||
2058 | static const char ID; | ||||||
2059 | }; | ||||||
2060 | |||||||
2061 | struct AAICVTrackerFunction : public AAICVTracker { | ||||||
2062 | AAICVTrackerFunction(const IRPosition &IRP, Attributor &A) | ||||||
2063 | : AAICVTracker(IRP, A) {} | ||||||
2064 | |||||||
2065 | // FIXME: come up with better string. | ||||||
2066 | const std::string getAsStr() const override { return "ICVTrackerFunction"; } | ||||||
2067 | |||||||
2068 | // FIXME: come up with some stats. | ||||||
2069 | void trackStatistics() const override {} | ||||||
2070 | |||||||
2071 | /// We don't manifest anything for this AA. | ||||||
2072 | ChangeStatus manifest(Attributor &A) override { | ||||||
2073 | return ChangeStatus::UNCHANGED; | ||||||
2074 | } | ||||||
2075 | |||||||
2076 | // Map of ICV to their values at specific program point. | ||||||
2077 | EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar, | ||||||
2078 | InternalControlVar::ICV___last> | ||||||
2079 | ICVReplacementValuesMap; | ||||||
2080 | |||||||
2081 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
2082 | ChangeStatus HasChanged = ChangeStatus::UNCHANGED; | ||||||
2083 | |||||||
2084 | Function *F = getAnchorScope(); | ||||||
2085 | |||||||
2086 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2087 | |||||||
2088 | for (InternalControlVar ICV : TrackableICVs) { | ||||||
2089 | auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter]; | ||||||
2090 | |||||||
2091 | auto &ValuesMap = ICVReplacementValuesMap[ICV]; | ||||||
2092 | auto TrackValues = [&](Use &U, Function &) { | ||||||
2093 | CallInst *CI = OpenMPOpt::getCallIfRegularCall(U); | ||||||
2094 | if (!CI) | ||||||
2095 | return false; | ||||||
2096 | |||||||
2097 | // FIXME: handle setters with more than one argument. | ||||||
2098 | /// Track new value. | ||||||
2099 | if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second) | ||||||
2100 | HasChanged = ChangeStatus::CHANGED; | ||||||
2101 | |||||||
2102 | return false; | ||||||
2103 | }; | ||||||
2104 | |||||||
2105 | auto CallCheck = [&](Instruction &I) { | ||||||
2106 | Optional<Value *> ReplVal = getValueForCall(A, &I, ICV); | ||||||
2107 | if (ReplVal.hasValue() && | ||||||
2108 | ValuesMap.insert(std::make_pair(&I, *ReplVal)).second) | ||||||
2109 | HasChanged = ChangeStatus::CHANGED; | ||||||
2110 | |||||||
2111 | return true; | ||||||
2112 | }; | ||||||
2113 | |||||||
2114 | // Track all changes of an ICV. | ||||||
2115 | SetterRFI.foreachUse(TrackValues, F); | ||||||
2116 | |||||||
2117 | bool UsedAssumedInformation = false; | ||||||
2118 | A.checkForAllInstructions(CallCheck, *this, {Instruction::Call}, | ||||||
2119 | UsedAssumedInformation, | ||||||
2120 | /* CheckBBLivenessOnly */ true); | ||||||
2121 | |||||||
2122 | /// TODO: Figure out a way to avoid adding entry in | ||||||
2123 | /// ICVReplacementValuesMap | ||||||
2124 | Instruction *Entry = &F->getEntryBlock().front(); | ||||||
2125 | if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry)) | ||||||
2126 | ValuesMap.insert(std::make_pair(Entry, nullptr)); | ||||||
2127 | } | ||||||
2128 | |||||||
2129 | return HasChanged; | ||||||
2130 | } | ||||||
2131 | |||||||
2132 | /// Helper to check if \p I is a call and get the value for it if it is | ||||||
2133 | /// unique. | ||||||
2134 | Optional<Value *> getValueForCall(Attributor &A, const Instruction *I, | ||||||
2135 | InternalControlVar &ICV) const { | ||||||
2136 | |||||||
2137 | const auto *CB = dyn_cast<CallBase>(I); | ||||||
2138 | if (!CB || CB->hasFnAttr("no_openmp") || | ||||||
2139 | CB->hasFnAttr("no_openmp_routines")) | ||||||
2140 | return None; | ||||||
2141 | |||||||
2142 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2143 | auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter]; | ||||||
2144 | auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter]; | ||||||
2145 | Function *CalledFunction = CB->getCalledFunction(); | ||||||
2146 | |||||||
2147 | // Indirect call, assume ICV changes. | ||||||
2148 | if (CalledFunction == nullptr) | ||||||
2149 | return nullptr; | ||||||
2150 | if (CalledFunction == GetterRFI.Declaration) | ||||||
2151 | return None; | ||||||
2152 | if (CalledFunction == SetterRFI.Declaration) { | ||||||
2153 | if (ICVReplacementValuesMap[ICV].count(I)) | ||||||
2154 | return ICVReplacementValuesMap[ICV].lookup(I); | ||||||
2155 | |||||||
2156 | return nullptr; | ||||||
2157 | } | ||||||
2158 | |||||||
2159 | // Since we don't know, assume it changes the ICV. | ||||||
2160 | if (CalledFunction->isDeclaration()) | ||||||
2161 | return nullptr; | ||||||
2162 | |||||||
2163 | const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( | ||||||
2164 | *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED); | ||||||
2165 | |||||||
2166 | if (ICVTrackingAA.isAssumedTracked()) | ||||||
2167 | return ICVTrackingAA.getUniqueReplacementValue(ICV); | ||||||
2168 | |||||||
2169 | // If we don't know, assume it changes. | ||||||
2170 | return nullptr; | ||||||
2171 | } | ||||||
2172 | |||||||
2173 | // We don't check unique value for a function, so return None. | ||||||
2174 | Optional<Value *> | ||||||
2175 | getUniqueReplacementValue(InternalControlVar ICV) const override { | ||||||
2176 | return None; | ||||||
2177 | } | ||||||
2178 | |||||||
2179 | /// Return the value with which \p I can be replaced for specific \p ICV. | ||||||
2180 | Optional<Value *> getReplacementValue(InternalControlVar ICV, | ||||||
2181 | const Instruction *I, | ||||||
2182 | Attributor &A) const override { | ||||||
2183 | const auto &ValuesMap = ICVReplacementValuesMap[ICV]; | ||||||
2184 | if (ValuesMap.count(I)) | ||||||
2185 | return ValuesMap.lookup(I); | ||||||
2186 | |||||||
2187 | SmallVector<const Instruction *, 16> Worklist; | ||||||
2188 | SmallPtrSet<const Instruction *, 16> Visited; | ||||||
2189 | Worklist.push_back(I); | ||||||
2190 | |||||||
2191 | Optional<Value *> ReplVal; | ||||||
2192 | |||||||
2193 | while (!Worklist.empty()) { | ||||||
2194 | const Instruction *CurrInst = Worklist.pop_back_val(); | ||||||
2195 | if (!Visited.insert(CurrInst).second) | ||||||
2196 | continue; | ||||||
2197 | |||||||
2198 | const BasicBlock *CurrBB = CurrInst->getParent(); | ||||||
2199 | |||||||
2200 | // Go up and look for all potential setters/calls that might change the | ||||||
2201 | // ICV. | ||||||
2202 | while ((CurrInst = CurrInst->getPrevNode())) { | ||||||
2203 | if (ValuesMap.count(CurrInst)) { | ||||||
2204 | Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst); | ||||||
2205 | // Unknown value, track new. | ||||||
2206 | if (!ReplVal.hasValue()) { | ||||||
2207 | ReplVal = NewReplVal; | ||||||
2208 | break; | ||||||
2209 | } | ||||||
2210 | |||||||
2211 | // If we found a new value, we can't know the icv value anymore. | ||||||
2212 | if (NewReplVal.hasValue()) | ||||||
2213 | if (ReplVal != NewReplVal) | ||||||
2214 | return nullptr; | ||||||
2215 | |||||||
2216 | break; | ||||||
2217 | } | ||||||
2218 | |||||||
2219 | Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV); | ||||||
2220 | if (!NewReplVal.hasValue()) | ||||||
2221 | continue; | ||||||
2222 | |||||||
2223 | // Unknown value, track new. | ||||||
2224 | if (!ReplVal.hasValue()) { | ||||||
2225 | ReplVal = NewReplVal; | ||||||
2226 | break; | ||||||
2227 | } | ||||||
2228 | |||||||
2229 | // NewReplVal has a value at this point. If it differs from the one we | ||||||
2230 | // tracked so far, we cannot know the ICV value anymore. | ||||||
2231 | if (ReplVal != NewReplVal) | ||||||
2232 | return nullptr; | ||||||
2233 | } | ||||||
2234 | |||||||
2235 | // If we are in the same BB and we have a value, we are done. | ||||||
2236 | if (CurrBB == I->getParent() && ReplVal.hasValue()) | ||||||
2237 | return ReplVal; | ||||||
2238 | |||||||
2239 | // Go through all predecessors and add terminators for analysis. | ||||||
2240 | for (const BasicBlock *Pred : predecessors(CurrBB)) | ||||||
2241 | if (const Instruction *Terminator = Pred->getTerminator()) | ||||||
2242 | Worklist.push_back(Terminator); | ||||||
2243 | } | ||||||
2244 | |||||||
2245 | return ReplVal; | ||||||
2246 | } | ||||||
2247 | }; | ||||||
2248 | |||||||
2249 | struct AAICVTrackerFunctionReturned : AAICVTracker { | ||||||
2250 | AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A) | ||||||
2251 | : AAICVTracker(IRP, A) {} | ||||||
2252 | |||||||
2253 | // FIXME: come up with better string. | ||||||
2254 | const std::string getAsStr() const override { | ||||||
2255 | return "ICVTrackerFunctionReturned"; | ||||||
2256 | } | ||||||
2257 | |||||||
2258 | // FIXME: come up with some stats. | ||||||
2259 | void trackStatistics() const override {} | ||||||
2260 | |||||||
2261 | /// We don't manifest anything for this AA. | ||||||
2262 | ChangeStatus manifest(Attributor &A) override { | ||||||
2263 | return ChangeStatus::UNCHANGED; | ||||||
2264 | } | ||||||
2265 | |||||||
2266 | // Map of ICV to their values at specific program point. | ||||||
2267 | EnumeratedArray<Optional<Value *>, InternalControlVar, | ||||||
2268 | InternalControlVar::ICV___last> | ||||||
2269 | ICVReplacementValuesMap; | ||||||
2270 | |||||||
2271 | /// Return the value with which \p I can be replaced for specific \p ICV. | ||||||
2272 | Optional<Value *> | ||||||
2273 | getUniqueReplacementValue(InternalControlVar ICV) const override { | ||||||
2274 | return ICVReplacementValuesMap[ICV]; | ||||||
2275 | } | ||||||
2276 | |||||||
2277 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
2278 | ChangeStatus Changed = ChangeStatus::UNCHANGED; | ||||||
2279 | const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( | ||||||
2280 | *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); | ||||||
2281 | |||||||
2282 | if (!ICVTrackingAA.isAssumedTracked()) | ||||||
2283 | return indicatePessimisticFixpoint(); | ||||||
2284 | |||||||
2285 | for (InternalControlVar ICV : TrackableICVs) { | ||||||
2286 | Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; | ||||||
2287 | Optional<Value *> UniqueICVValue; | ||||||
2288 | |||||||
2289 | auto CheckReturnInst = [&](Instruction &I) { | ||||||
2290 | Optional<Value *> NewReplVal = | ||||||
2291 | ICVTrackingAA.getReplacementValue(ICV, &I, A); | ||||||
2292 | |||||||
2293 | // If we found a second ICV value there is no unique returned value. | ||||||
2294 | if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal) | ||||||
2295 | return false; | ||||||
2296 | |||||||
2297 | UniqueICVValue = NewReplVal; | ||||||
2298 | |||||||
2299 | return true; | ||||||
2300 | }; | ||||||
2301 | |||||||
2302 | bool UsedAssumedInformation = false; | ||||||
2303 | if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}, | ||||||
2304 | UsedAssumedInformation, | ||||||
2305 | /* CheckBBLivenessOnly */ true)) | ||||||
2306 | UniqueICVValue = nullptr; | ||||||
2307 | |||||||
2308 | if (UniqueICVValue == ReplVal) | ||||||
2309 | continue; | ||||||
2310 | |||||||
2311 | ReplVal = UniqueICVValue; | ||||||
2312 | Changed = ChangeStatus::CHANGED; | ||||||
2313 | } | ||||||
2314 | |||||||
2315 | return Changed; | ||||||
2316 | } | ||||||
2317 | }; | ||||||
2318 | |||||||
2319 | struct AAICVTrackerCallSite : AAICVTracker { | ||||||
2320 | AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A) | ||||||
2321 | : AAICVTracker(IRP, A) {} | ||||||
2322 | |||||||
2323 | void initialize(Attributor &A) override { | ||||||
2324 | Function *F = getAnchorScope(); | ||||||
2325 | if (!F || !A.isFunctionIPOAmendable(*F)) | ||||||
2326 | indicatePessimisticFixpoint(); | ||||||
2327 | |||||||
2328 | // We only initialize this AA for getters, so we need to know which ICV it | ||||||
2329 | // gets. | ||||||
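// For example, the nthreads ICV is assumed to map to the omp_get_max_threads
// getter; the exact mapping is taken from the runtime function info cache.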
2330 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2331 | for (InternalControlVar ICV : TrackableICVs) { | ||||||
2332 | auto ICVInfo = OMPInfoCache.ICVs[ICV]; | ||||||
2333 | auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter]; | ||||||
2334 | if (Getter.Declaration == getAssociatedFunction()) { | ||||||
2335 | AssociatedICV = ICVInfo.Kind; | ||||||
2336 | return; | ||||||
2337 | } | ||||||
2338 | } | ||||||
2339 | |||||||
2340 | /// Unknown ICV. | ||||||
2341 | indicatePessimisticFixpoint(); | ||||||
2342 | } | ||||||
2343 | |||||||
2344 | ChangeStatus manifest(Attributor &A) override { | ||||||
2345 | if (!ReplVal.hasValue() || !ReplVal.getValue()) | ||||||
2346 | return ChangeStatus::UNCHANGED; | ||||||
2347 | |||||||
2348 | A.changeValueAfterManifest(*getCtxI(), **ReplVal); | ||||||
2349 | A.deleteAfterManifest(*getCtxI()); | ||||||
2350 | |||||||
2351 | return ChangeStatus::CHANGED; | ||||||
2352 | } | ||||||
2353 | |||||||
2354 | // FIXME: come up with better string. | ||||||
2355 | const std::string getAsStr() const override { return "ICVTrackerCallSite"; } | ||||||
2356 | |||||||
2357 | // FIXME: come up with some stats. | ||||||
2358 | void trackStatistics() const override {} | ||||||
2359 | |||||||
2360 | InternalControlVar AssociatedICV; | ||||||
2361 | Optional<Value *> ReplVal; | ||||||
2362 | |||||||
2363 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
2364 | const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( | ||||||
2365 | *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); | ||||||
2366 | |||||||
2367 | // We don't have any information, so we assume it changes the ICV. | ||||||
2368 | if (!ICVTrackingAA.isAssumedTracked()) | ||||||
2369 | return indicatePessimisticFixpoint(); | ||||||
2370 | |||||||
2371 | Optional<Value *> NewReplVal = | ||||||
2372 | ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A); | ||||||
2373 | |||||||
2374 | if (ReplVal == NewReplVal) | ||||||
2375 | return ChangeStatus::UNCHANGED; | ||||||
2376 | |||||||
2377 | ReplVal = NewReplVal; | ||||||
2378 | return ChangeStatus::CHANGED; | ||||||
2379 | } | ||||||
2380 | |||||||
2381 | // Return the value with which associated value can be replaced for specific | ||||||
2382 | // \p ICV. | ||||||
2383 | Optional<Value *> | ||||||
2384 | getUniqueReplacementValue(InternalControlVar ICV) const override { | ||||||
2385 | return ReplVal; | ||||||
2386 | } | ||||||
2387 | }; | ||||||
2388 | |||||||
2389 | struct AAICVTrackerCallSiteReturned : AAICVTracker { | ||||||
2390 | AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A) | ||||||
2391 | : AAICVTracker(IRP, A) {} | ||||||
2392 | |||||||
2393 | // FIXME: come up with better string. | ||||||
2394 | const std::string getAsStr() const override { | ||||||
2395 | return "ICVTrackerCallSiteReturned"; | ||||||
2396 | } | ||||||
2397 | |||||||
2398 | // FIXME: come up with some stats. | ||||||
2399 | void trackStatistics() const override {} | ||||||
2400 | |||||||
2401 | /// We don't manifest anything for this AA. | ||||||
2402 | ChangeStatus manifest(Attributor &A) override { | ||||||
2403 | return ChangeStatus::UNCHANGED; | ||||||
2404 | } | ||||||
2405 | |||||||
2406 | // Map of ICV to their values at specific program point. | ||||||
2407 | EnumeratedArray<Optional<Value *>, InternalControlVar, | ||||||
2408 | InternalControlVar::ICV___last> | ||||||
2409 | ICVReplacementValuesMap; | ||||||
2410 | |||||||
2411 | /// Return the value with which associated value can be replaced for specific | ||||||
2412 | /// \p ICV. | ||||||
2413 | Optional<Value *> | ||||||
2414 | getUniqueReplacementValue(InternalControlVar ICV) const override { | ||||||
2415 | return ICVReplacementValuesMap[ICV]; | ||||||
2416 | } | ||||||
2417 | |||||||
2418 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
2419 | ChangeStatus Changed = ChangeStatus::UNCHANGED; | ||||||
2420 | const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>( | ||||||
2421 | *this, IRPosition::returned(*getAssociatedFunction()), | ||||||
2422 | DepClassTy::REQUIRED); | ||||||
2423 | |||||||
2424 | // We don't have any information, so we assume it changes the ICV. | ||||||
2425 | if (!ICVTrackingAA.isAssumedTracked()) | ||||||
2426 | return indicatePessimisticFixpoint(); | ||||||
2427 | |||||||
2428 | for (InternalControlVar ICV : TrackableICVs) { | ||||||
2429 | Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV]; | ||||||
2430 | Optional<Value *> NewReplVal = | ||||||
2431 | ICVTrackingAA.getUniqueReplacementValue(ICV); | ||||||
2432 | |||||||
2433 | if (ReplVal == NewReplVal) | ||||||
2434 | continue; | ||||||
2435 | |||||||
2436 | ReplVal = NewReplVal; | ||||||
2437 | Changed = ChangeStatus::CHANGED; | ||||||
2438 | } | ||||||
2439 | return Changed; | ||||||
2440 | } | ||||||
2441 | }; | ||||||
2442 | |||||||
2443 | struct AAExecutionDomainFunction : public AAExecutionDomain { | ||||||
2444 | AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A) | ||||||
2445 | : AAExecutionDomain(IRP, A) {} | ||||||
2446 | |||||||
2447 | const std::string getAsStr() const override { | ||||||
2448 | return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) + | ||||||
2449 | "/" + std::to_string(NumBBs) + " BBs thread 0 only."; | ||||||
2450 | } | ||||||
2451 | |||||||
2452 | /// See AbstractAttribute::trackStatistics(). | ||||||
2453 | void trackStatistics() const override {} | ||||||
2454 | |||||||
2455 | void initialize(Attributor &A) override { | ||||||
2456 | Function *F = getAnchorScope(); | ||||||
2457 | for (const auto &BB : *F) | ||||||
2458 | SingleThreadedBBs.insert(&BB); | ||||||
2459 | NumBBs = SingleThreadedBBs.size(); | ||||||
2460 | } | ||||||
2461 | |||||||
2462 | ChangeStatus manifest(Attributor &A) override { | ||||||
2463 | LLVM_DEBUG({ | ||||||
2464 | for (const BasicBlock *BB : SingleThreadedBBs) | ||||||
2465 | dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " " | ||||||
2466 | << BB->getName() << " is executed by a single thread.\n"; | ||||||
2467 | }); | ||||||
2468 | return ChangeStatus::UNCHANGED; | ||||||
2469 | } | ||||||
2470 | |||||||
2471 | ChangeStatus updateImpl(Attributor &A) override; | ||||||
2472 | |||||||
2473 | /// Check if an instruction is executed by a single thread. | ||||||
2474 | bool isExecutedByInitialThreadOnly(const Instruction &I) const override { | ||||||
2475 | return isExecutedByInitialThreadOnly(*I.getParent()); | ||||||
2476 | } | ||||||
2477 | |||||||
2478 | bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override { | ||||||
2479 | return isValidState() && SingleThreadedBBs.contains(&BB); | ||||||
2480 | } | ||||||
2481 | |||||||
2482 | /// Set of basic blocks that are executed by a single thread. | ||||||
2483 | DenseSet<const BasicBlock *> SingleThreadedBBs; | ||||||
2484 | |||||||
2485 | /// Total number of basic blocks in this function. | ||||||
2486 | long unsigned NumBBs; | ||||||
2487 | }; | ||||||
2488 | |||||||
2489 | ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { | ||||||
2490 | Function *F = getAnchorScope(); | ||||||
2491 | ReversePostOrderTraversal<Function *> RPOT(F); | ||||||
2492 | auto NumSingleThreadedBBs = SingleThreadedBBs.size(); | ||||||
2493 | |||||||
2494 | bool AllCallSitesKnown; | ||||||
2495 | auto PredForCallSite = [&](AbstractCallSite ACS) { | ||||||
2496 | const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>( | ||||||
2497 | *this, IRPosition::function(*ACS.getInstruction()->getFunction()), | ||||||
2498 | DepClassTy::REQUIRED); | ||||||
2499 | return ACS.isDirectCall() && | ||||||
2500 | ExecutionDomainAA.isExecutedByInitialThreadOnly( | ||||||
2501 | *ACS.getInstruction()); | ||||||
2502 | }; | ||||||
2503 | |||||||
2504 | if (!A.checkForAllCallSites(PredForCallSite, *this, | ||||||
2505 | /* RequiresAllCallSites */ true, | ||||||
2506 | AllCallSitesKnown)) | ||||||
2507 | SingleThreadedBBs.erase(&F->getEntryBlock()); | ||||||
2508 | |||||||
2509 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2510 | auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; | ||||||
2511 | |||||||
2512 | // Check if the edge into the successor block compares the __kmpc_target_init | ||||||
2513 | // result with -1. If we are in non-SPMD-mode that signals only the main | ||||||
2514 | // thread will execute the edge. | ||||||
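// A rough sketch of the IR pattern this lambda looks for (illustrative only;
// the value names are invented and exact operand types may differ):
//   %tid  = call i32 @__kmpc_target_init(..., i1 false /* IsSPMD */, ...)
//   %cond = icmp eq i32 %tid, -1
//   br i1 %cond, label %initial.thread.only, label %workers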
2515 | auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) { | ||||||
2516 | if (!Edge || !Edge->isConditional()) | ||||||
2517 | return false; | ||||||
2518 | if (Edge->getSuccessor(0) != SuccessorBB) | ||||||
2519 | return false; | ||||||
2520 | |||||||
2521 | auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition()); | ||||||
2522 | if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality()) | ||||||
2523 | return false; | ||||||
2524 | |||||||
2525 | ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1)); | ||||||
2526 | if (!C) | ||||||
2527 | return false; | ||||||
2528 | |||||||
2529 | // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!) | ||||||
2530 | if (C->isAllOnesValue()) { | ||||||
2531 | auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0)); | ||||||
2532 | CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; | ||||||
2533 | if (!CB) | ||||||
2534 | return false; | ||||||
2535 | const int InitIsSPMDArgNo = 1; | ||||||
2536 | auto *IsSPMDModeCI = | ||||||
2537 | dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo)); | ||||||
2538 | return IsSPMDModeCI && IsSPMDModeCI->isZero(); | ||||||
2539 | } | ||||||
2540 | |||||||
2541 | return false; | ||||||
2542 | }; | ||||||
2543 | |||||||
2544 | // Merge all the predecessor states into the current basic block. A basic block | ||||||
2545 | // is single-threaded if every predecessor is, or reaches it via an initial-thread-only edge. | ||||||
2546 | auto MergePredecessorStates = [&](BasicBlock *BB) { | ||||||
2547 | if (pred_begin(BB) == pred_end(BB)) | ||||||
2548 | return SingleThreadedBBs.contains(BB); | ||||||
2549 | |||||||
2550 | bool IsInitialThread = true; | ||||||
2551 | for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB); | ||||||
2552 | PredBB != PredEndBB; ++PredBB) { | ||||||
2553 | if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()), | ||||||
2554 | BB)) | ||||||
2555 | IsInitialThread &= SingleThreadedBBs.contains(*PredBB); | ||||||
2556 | } | ||||||
2557 | |||||||
2558 | return IsInitialThread; | ||||||
2559 | }; | ||||||
2560 | |||||||
2561 | for (auto *BB : RPOT) { | ||||||
2562 | if (!MergePredecessorStates(BB)) | ||||||
2563 | SingleThreadedBBs.erase(BB); | ||||||
2564 | } | ||||||
2565 | |||||||
2566 | return (NumSingleThreadedBBs == SingleThreadedBBs.size()) | ||||||
2567 | ? ChangeStatus::UNCHANGED | ||||||
2568 | : ChangeStatus::CHANGED; | ||||||
2569 | } | ||||||
2570 | |||||||
2571 | /// Try to replace memory allocation calls executed by a single thread with a | ||||||
2572 | /// static buffer of shared memory. | ||||||
2573 | struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> { | ||||||
2574 | using Base = StateWrapper<BooleanState, AbstractAttribute>; | ||||||
2575 | AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {} | ||||||
2576 | |||||||
2577 | /// Create an abstract attribute view for the position \p IRP. | ||||||
2578 | static AAHeapToShared &createForPosition(const IRPosition &IRP, | ||||||
2579 | Attributor &A); | ||||||
2580 | |||||||
2581 | /// Returns true if HeapToShared conversion is assumed to be possible. | ||||||
2582 | virtual bool isAssumedHeapToShared(CallBase &CB) const = 0; | ||||||
2583 | |||||||
2584 | /// Returns true if HeapToShared conversion is assumed and the CB is a | ||||||
2585 | /// callsite to a free operation to be removed. | ||||||
2586 | virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0; | ||||||
2587 | |||||||
2588 | /// See AbstractAttribute::getName(). | ||||||
2589 | const std::string getName() const override { return "AAHeapToShared"; } | ||||||
2590 | |||||||
2591 | /// See AbstractAttribute::getIdAddr(). | ||||||
2592 | const char *getIdAddr() const override { return &ID; } | ||||||
2593 | |||||||
2594 | /// This function should return true if the type of the \p AA is | ||||||
2595 | /// AAHeapToShared. | ||||||
2596 | static bool classof(const AbstractAttribute *AA) { | ||||||
2597 | return (AA->getIdAddr() == &ID); | ||||||
2598 | } | ||||||
2599 | |||||||
2600 | /// Unique ID (due to the unique address) | ||||||
2601 | static const char ID; | ||||||
2602 | }; | ||||||
2603 | |||||||
2604 | struct AAHeapToSharedFunction : public AAHeapToShared { | ||||||
2605 | AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A) | ||||||
2606 | : AAHeapToShared(IRP, A) {} | ||||||
2607 | |||||||
2608 | const std::string getAsStr() const override { | ||||||
2609 | return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) + | ||||||
2610 | " malloc calls eligible."; | ||||||
2611 | } | ||||||
2612 | |||||||
2613 | /// See AbstractAttribute::trackStatistics(). | ||||||
2614 | void trackStatistics() const override {} | ||||||
2615 | |||||||
2616 | /// This function finds free calls that will be removed by the | ||||||
2617 | /// HeapToShared transformation. | ||||||
2618 | void findPotentialRemovedFreeCalls(Attributor &A) { | ||||||
2619 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2620 | auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; | ||||||
2621 | |||||||
2622 | PotentialRemovedFreeCalls.clear(); | ||||||
2623 | // Update free call users of found malloc calls. | ||||||
2624 | for (CallBase *CB : MallocCalls) { | ||||||
2625 | SmallVector<CallBase *, 4> FreeCalls; | ||||||
2626 | for (auto *U : CB->users()) { | ||||||
2627 | CallBase *C = dyn_cast<CallBase>(U); | ||||||
2628 | if (C && C->getCalledFunction() == FreeRFI.Declaration) | ||||||
2629 | FreeCalls.push_back(C); | ||||||
2630 | } | ||||||
2631 | |||||||
2632 | if (FreeCalls.size() != 1) | ||||||
2633 | continue; | ||||||
2634 | |||||||
2635 | PotentialRemovedFreeCalls.insert(FreeCalls.front()); | ||||||
2636 | } | ||||||
2637 | } | ||||||
2638 | |||||||
2639 | void initialize(Attributor &A) override { | ||||||
2640 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2641 | auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; | ||||||
2642 | |||||||
2643 | for (User *U : RFI.Declaration->users()) | ||||||
2644 | if (CallBase *CB = dyn_cast<CallBase>(U)) | ||||||
2645 | MallocCalls.insert(CB); | ||||||
2646 | |||||||
2647 | findPotentialRemovedFreeCalls(A); | ||||||
2648 | } | ||||||
2649 | |||||||
2650 | bool isAssumedHeapToShared(CallBase &CB) const override { | ||||||
2651 | return isValidState() && MallocCalls.count(&CB); | ||||||
2652 | } | ||||||
2653 | |||||||
2654 | bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override { | ||||||
2655 | return isValidState() && PotentialRemovedFreeCalls.count(&CB); | ||||||
2656 | } | ||||||
2657 | |||||||
2658 | ChangeStatus manifest(Attributor &A) override { | ||||||
2659 | if (MallocCalls.empty()) | ||||||
| |||||||
2660 | return ChangeStatus::UNCHANGED; | ||||||
2661 | |||||||
2662 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2663 | auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; | ||||||
2664 | |||||||
2665 | Function *F = getAnchorScope(); | ||||||
2666 | auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this, | ||||||
2667 | DepClassTy::OPTIONAL); | ||||||
2668 | |||||||
2669 | ChangeStatus Changed = ChangeStatus::UNCHANGED; | ||||||
2670 | for (CallBase *CB : MallocCalls) { | ||||||
2671 | // Skip replacing this if HeapToStack has already claimed it. | ||||||
2672 | if (HS && HS->isAssumedHeapToStack(*CB)) | ||||||
2673 | continue; | ||||||
2674 | |||||||
2675 | // Find the unique free call to remove it. | ||||||
2676 | SmallVector<CallBase *, 4> FreeCalls; | ||||||
2677 | for (auto *U : CB->users()) { | ||||||
2678 | CallBase *C = dyn_cast<CallBase>(U); | ||||||
2679 | if (C && C->getCalledFunction() == FreeCall.Declaration) | ||||||
2680 | FreeCalls.push_back(C); | ||||||
2681 | } | ||||||
2682 | if (FreeCalls.size() != 1) | ||||||
2683 | continue; | ||||||
2684 | |||||||
2685 | ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0)); | ||||||
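// Note: AllocSize is the result of a dyn_cast and could in principle be null
// here; updateImpl() below is assumed to have dropped any malloc call whose
// size operand is not a ConstantInt, so the dereferences that follow rely on
// that invariant holding.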
2686 | |||||||
2687 | LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " | ||||||
2688 | << CB->getCaller()->getName() << " with " | ||||||
2689 | << AllocSize->getZExtValue() | ||||||
2690 | << " bytes of shared memory\n"); | ||||||
2691 | |||||||
2692 | // Create a new shared memory buffer of the same size as the allocation | ||||||
2693 | // and replace all the uses of the original allocation with it. | ||||||
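// Roughly, the rewrite performed below (sizes and names are illustrative):
//   %p = call i8* @__kmpc_alloc_shared(i64 24)
// is replaced by a module-level buffer in the shared address space,
//   @p_shared = internal addrspace(3) global [24 x i8] undef
// whose pointer cast takes over all uses of %p, while the unique matching
// __kmpc_free_shared call is deleted.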
2694 | Module *M = CB->getModule(); | ||||||
2695 | Type *Int8Ty = Type::getInt8Ty(M->getContext()); | ||||||
2696 | Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); | ||||||
| |||||||
2697 | auto *SharedMem = new GlobalVariable( | ||||||
2698 | *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, | ||||||
2699 | UndefValue::get(Int8ArrTy), CB->getName(), nullptr, | ||||||
2700 | GlobalValue::NotThreadLocal, | ||||||
2701 | static_cast<unsigned>(AddressSpace::Shared)); | ||||||
2702 | auto *NewBuffer = | ||||||
2703 | ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo()); | ||||||
2704 | |||||||
2705 | auto Remark = [&](OptimizationRemark OR) { | ||||||
2706 | return OR << "Replaced globalized variable with " | ||||||
2707 | << ore::NV("SharedMemory", AllocSize->getZExtValue()) | ||||||
2708 | << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ") | ||||||
2709 | << "of shared memory."; | ||||||
2710 | }; | ||||||
2711 | A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark); | ||||||
2712 | |||||||
2713 | SharedMem->setAlignment(MaybeAlign(32)); | ||||||
2714 | |||||||
2715 | A.changeValueAfterManifest(*CB, *NewBuffer); | ||||||
2716 | A.deleteAfterManifest(*CB); | ||||||
2717 | A.deleteAfterManifest(*FreeCalls.front()); | ||||||
2718 | |||||||
2719 | NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); | ||||||
2720 | Changed = ChangeStatus::CHANGED; | ||||||
2721 | } | ||||||
2722 | |||||||
2723 | return Changed; | ||||||
2724 | } | ||||||
2725 | |||||||
2726 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
2727 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2728 | auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; | ||||||
2729 | Function *F = getAnchorScope(); | ||||||
2730 | |||||||
2731 | auto NumMallocCalls = MallocCalls.size(); | ||||||
2732 | |||||||
2733 | // Only keep malloc calls executed by a single thread and with a constant size. | ||||||
2734 | for (User *U : RFI.Declaration->users()) { | ||||||
2735 | const auto &ED = A.getAAFor<AAExecutionDomain>( | ||||||
2736 | *this, IRPosition::function(*F), DepClassTy::REQUIRED); | ||||||
2737 | if (CallBase *CB = dyn_cast<CallBase>(U)) | ||||||
2738 | if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) || | ||||||
2739 | !ED.isExecutedByInitialThreadOnly(*CB)) | ||||||
2740 | MallocCalls.erase(CB); | ||||||
2741 | } | ||||||
2742 | |||||||
2743 | findPotentialRemovedFreeCalls(A); | ||||||
2744 | |||||||
2745 | if (NumMallocCalls != MallocCalls.size()) | ||||||
2746 | return ChangeStatus::CHANGED; | ||||||
2747 | |||||||
2748 | return ChangeStatus::UNCHANGED; | ||||||
2749 | } | ||||||
2750 | |||||||
2751 | /// Collection of all malloc calls in a function. | ||||||
2752 | SmallPtrSet<CallBase *, 4> MallocCalls; | ||||||
2753 | /// Collection of potentially removed free calls in a function. | ||||||
2754 | SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls; | ||||||
2755 | }; | ||||||
2756 | |||||||
2757 | struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { | ||||||
2758 | using Base = StateWrapper<KernelInfoState, AbstractAttribute>; | ||||||
2759 | AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {} | ||||||
2760 | |||||||
2761 | /// Statistics are tracked as part of manifest for now. | ||||||
2762 | void trackStatistics() const override {} | ||||||
2763 | |||||||
2764 | /// See AbstractAttribute::getAsStr() | ||||||
2765 | const std::string getAsStr() const override { | ||||||
2766 | if (!isValidState()) | ||||||
2767 | return "<invalid>"; | ||||||
2768 | return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD" | ||||||
2769 | : "generic") + | ||||||
2770 | std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]" | ||||||
2771 | : "") + | ||||||
2772 | std::string(" #PRs: ") + | ||||||
2773 | std::to_string(ReachedKnownParallelRegions.size()) + | ||||||
2774 | ", #Unknown PRs: " + | ||||||
2775 | std::to_string(ReachedUnknownParallelRegions.size()); | ||||||
2776 | } | ||||||
2777 | |||||||
2778 | /// Create an abstract attribute view for the position \p IRP. | ||||||
2779 | static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A); | ||||||
2780 | |||||||
2781 | /// See AbstractAttribute::getName() | ||||||
2782 | const std::string getName() const override { return "AAKernelInfo"; } | ||||||
2783 | |||||||
2784 | /// See AbstractAttribute::getIdAddr() | ||||||
2785 | const char *getIdAddr() const override { return &ID; } | ||||||
2786 | |||||||
2787 | /// This function should return true if the type of the \p AA is AAKernelInfo | ||||||
2788 | static bool classof(const AbstractAttribute *AA) { | ||||||
2789 | return (AA->getIdAddr() == &ID); | ||||||
2790 | } | ||||||
2791 | |||||||
2792 | static const char ID; | ||||||
2793 | }; | ||||||
2794 | |||||||
2795 | /// The function kernel info abstract attribute, basically, what can we say | ||||||
2796 | /// about a function with regards to the KernelInfoState. | ||||||
2797 | struct AAKernelInfoFunction : AAKernelInfo { | ||||||
2798 | AAKernelInfoFunction(const IRPosition &IRP, Attributor &A) | ||||||
2799 | : AAKernelInfo(IRP, A) {} | ||||||
2800 | |||||||
2801 | /// See AbstractAttribute::initialize(...). | ||||||
2802 | void initialize(Attributor &A) override { | ||||||
2803 | // This is a high-level transform that might change the constant arguments | ||||||
2804 | // of the init and deinit calls. We need to tell the Attributor about this | ||||||
2805 | // to avoid other parts using the current constant value for simplification. | ||||||
2806 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2807 | |||||||
2808 | Function *Fn = getAnchorScope(); | ||||||
2809 | if (!OMPInfoCache.Kernels.count(Fn)) | ||||||
2810 | return; | ||||||
2811 | |||||||
2812 | // Add itself to the reaching kernel and set IsKernelEntry. | ||||||
2813 | ReachingKernelEntries.insert(Fn); | ||||||
2814 | IsKernelEntry = true; | ||||||
2815 | |||||||
2816 | OMPInformationCache::RuntimeFunctionInfo &InitRFI = | ||||||
2817 | OMPInfoCache.RFIs[OMPRTL___kmpc_target_init]; | ||||||
2818 | OMPInformationCache::RuntimeFunctionInfo &DeinitRFI = | ||||||
2819 | OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit]; | ||||||
2820 | |||||||
2821 | // For kernels we perform more initialization work, first we find the init | ||||||
2822 | // and deinit calls. | ||||||
2823 | auto StoreCallBase = [](Use &U, | ||||||
2824 | OMPInformationCache::RuntimeFunctionInfo &RFI, | ||||||
2825 | CallBase *&Storage) { | ||||||
2826 | CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); | ||||||
2827 | assert(CB && | ||||||
2828 | "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!"); | ||||||
2829 | assert(!Storage && | ||||||
2830 | "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!"); | ||||||
2831 | Storage = CB; | ||||||
2832 | return false; | ||||||
2833 | }; | ||||||
2834 | InitRFI.foreachUse( | ||||||
2835 | [&](Use &U, Function &) { | ||||||
2836 | StoreCallBase(U, InitRFI, KernelInitCB); | ||||||
2837 | return false; | ||||||
2838 | }, | ||||||
2839 | Fn); | ||||||
2840 | DeinitRFI.foreachUse( | ||||||
2841 | [&](Use &U, Function &) { | ||||||
2842 | StoreCallBase(U, DeinitRFI, KernelDeinitCB); | ||||||
2843 | return false; | ||||||
2844 | }, | ||||||
2845 | Fn); | ||||||
2846 | |||||||
2847 | assert((KernelInitCB && KernelDeinitCB) && | ||||||
2848 | "Kernel without __kmpc_target_init or __kmpc_target_deinit!"); | ||||||
2849 | |||||||
2850 | // For kernels we might need to initialize/finalize the IsSPMD state and | ||||||
2851 | // we need to register a simplification callback so that the Attributor | ||||||
2852 | // knows the constant arguments to __kmpc_target_init and | ||||||
2853 | // __kmpc_target_deinit might actually change. | ||||||
2854 | |||||||
2855 | Attributor::SimplifictionCallbackTy StateMachineSimplifyCB = | ||||||
2856 | [&](const IRPosition &IRP, const AbstractAttribute *AA, | ||||||
2857 | bool &UsedAssumedInformation) -> Optional<Value *> { | ||||||
2858 | // IRP represents the "use generic state machine" argument of an | ||||||
2859 | // __kmpc_target_init call. We will answer this one with the internal | ||||||
2860 | // state. As long as we are not in an invalid state, we will create a | ||||||
2861 | // custom state machine so the value should be a `i1 false`. If we are | ||||||
2862 | // in an invalid state, we won't change the value that is in the IR. | ||||||
2863 | if (!isValidState()) | ||||||
2864 | return nullptr; | ||||||
2865 | if (AA) | ||||||
2866 | A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); | ||||||
2867 | UsedAssumedInformation = !isAtFixpoint(); | ||||||
2868 | auto *FalseVal = | ||||||
2869 | ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0); | ||||||
2870 | return FalseVal; | ||||||
2871 | }; | ||||||
2872 | |||||||
2873 | Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB = | ||||||
2874 | [&](const IRPosition &IRP, const AbstractAttribute *AA, | ||||||
2875 | bool &UsedAssumedInformation) -> Optional<Value *> { | ||||||
2876 | // IRP represents the "SPMDCompatibilityTracker" argument of an | ||||||
2877 | // __kmpc_target_init or | ||||||
2878 | // __kmpc_target_deinit call. We will answer this one with the internal | ||||||
2879 | // state. | ||||||
2880 | if (!SPMDCompatibilityTracker.isValidState()) | ||||||
2881 | return nullptr; | ||||||
2882 | if (!SPMDCompatibilityTracker.isAtFixpoint()) { | ||||||
2883 | if (AA) | ||||||
2884 | A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); | ||||||
2885 | UsedAssumedInformation = true; | ||||||
2886 | } else { | ||||||
2887 | UsedAssumedInformation = false; | ||||||
2888 | } | ||||||
2889 | auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), | ||||||
2890 | SPMDCompatibilityTracker.isAssumed()); | ||||||
2891 | return Val; | ||||||
2892 | }; | ||||||
2893 | |||||||
2894 | Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB = | ||||||
2895 | [&](const IRPosition &IRP, const AbstractAttribute *AA, | ||||||
2896 | bool &UsedAssumedInformation) -> Optional<Value *> { | ||||||
2897 | // IRP represents the "RequiresFullRuntime" argument of an | ||||||
2898 | // __kmpc_target_init or __kmpc_target_deinit call. We will answer this | ||||||
2899 | // one with the internal state of the SPMDCompatibilityTracker, so if | ||||||
2900 | // generic then true, if SPMD then false. | ||||||
2901 | if (!SPMDCompatibilityTracker.isValidState()) | ||||||
2902 | return nullptr; | ||||||
2903 | if (!SPMDCompatibilityTracker.isAtFixpoint()) { | ||||||
2904 | if (AA) | ||||||
2905 | A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); | ||||||
2906 | UsedAssumedInformation = true; | ||||||
2907 | } else { | ||||||
2908 | UsedAssumedInformation = false; | ||||||
2909 | } | ||||||
2910 | auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), | ||||||
2911 | !SPMDCompatibilityTracker.isAssumed()); | ||||||
2912 | return Val; | ||||||
2913 | }; | ||||||
2914 | |||||||
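// Argument positions assumed by the indices below (a sketch of the runtime
// entry points, not authoritative):
//   __kmpc_target_init(ident, i1 IsSPMD, i1 UseGenericStateMachine,
//                      i1 RequiresFullRuntime)
//   __kmpc_target_deinit(ident, i1 IsSPMD, i1 RequiresFullRuntime)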
2915 | constexpr const int InitIsSPMDArgNo = 1; | ||||||
2916 | constexpr const int DeinitIsSPMDArgNo = 1; | ||||||
2917 | constexpr const int InitUseStateMachineArgNo = 2; | ||||||
2918 | constexpr const int InitRequiresFullRuntimeArgNo = 3; | ||||||
2919 | constexpr const int DeinitRequiresFullRuntimeArgNo = 2; | ||||||
2920 | A.registerSimplificationCallback( | ||||||
2921 | IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), | ||||||
2922 | StateMachineSimplifyCB); | ||||||
2923 | A.registerSimplificationCallback( | ||||||
2924 | IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo), | ||||||
2925 | IsSPMDModeSimplifyCB); | ||||||
2926 | A.registerSimplificationCallback( | ||||||
2927 | IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo), | ||||||
2928 | IsSPMDModeSimplifyCB); | ||||||
2929 | A.registerSimplificationCallback( | ||||||
2930 | IRPosition::callsite_argument(*KernelInitCB, | ||||||
2931 | InitRequiresFullRuntimeArgNo), | ||||||
2932 | IsGenericModeSimplifyCB); | ||||||
2933 | A.registerSimplificationCallback( | ||||||
2934 | IRPosition::callsite_argument(*KernelDeinitCB, | ||||||
2935 | DeinitRequiresFullRuntimeArgNo), | ||||||
2936 | IsGenericModeSimplifyCB); | ||||||
2937 | |||||||
2938 | // Check if we know we are in SPMD-mode already. | ||||||
2939 | ConstantInt *IsSPMDArg = | ||||||
2940 | dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); | ||||||
2941 | if (IsSPMDArg && !IsSPMDArg->isZero()) | ||||||
2942 | SPMDCompatibilityTracker.indicateOptimisticFixpoint(); | ||||||
2943 | } | ||||||
2944 | |||||||
2945 | /// Modify the IR based on the KernelInfoState as the fixpoint iteration is | ||||||
2946 | /// finished now. | ||||||
2947 | ChangeStatus manifest(Attributor &A) override { | ||||||
2948 | // If we are not looking at a kernel with __kmpc_target_init and | ||||||
2949 | // __kmpc_target_deinit call we cannot actually manifest the information. | ||||||
2950 | if (!KernelInitCB || !KernelDeinitCB) | ||||||
2951 | return ChangeStatus::UNCHANGED; | ||||||
2952 | |||||||
2953 | // Known SPMD-mode kernels need no manifest changes. | ||||||
2954 | if (SPMDCompatibilityTracker.isKnown()) | ||||||
2955 | return ChangeStatus::UNCHANGED; | ||||||
2956 | |||||||
2957 | // If we can, we change the execution mode to SPMD-mode; otherwise we build a | ||||||
2958 | // custom state machine. | ||||||
2959 | if (!changeToSPMDMode(A)) | ||||||
2960 | buildCustomStateMachine(A); | ||||||
2961 | |||||||
2962 | return ChangeStatus::CHANGED; | ||||||
2963 | } | ||||||
2964 | |||||||
2965 | bool changeToSPMDMode(Attributor &A) { | ||||||
2966 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
2967 | |||||||
2968 | if (!SPMDCompatibilityTracker.isAssumed()) { | ||||||
2969 | for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) { | ||||||
2970 | if (!NonCompatibleI) | ||||||
2971 | continue; | ||||||
2972 | |||||||
2973 | // Skip diagnostics on calls to known OpenMP runtime functions for now. | ||||||
2974 | if (auto *CB = dyn_cast<CallBase>(NonCompatibleI)) | ||||||
2975 | if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction())) | ||||||
2976 | continue; | ||||||
2977 | |||||||
2978 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
2979 | ORA << "Value has potential side effects preventing SPMD-mode " | ||||||
2980 | "execution"; | ||||||
2981 | if (isa<CallBase>(NonCompatibleI)) { | ||||||
2982 | ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to " | ||||||
2983 | "the called function to override"; | ||||||
2984 | } | ||||||
2985 | return ORA << "."; | ||||||
2986 | }; | ||||||
2987 | A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121", | ||||||
2988 | Remark); | ||||||
2989 | |||||||
2990 | LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: " | ||||||
2991 | << *NonCompatibleI << "\n"); | ||||||
2992 | } | ||||||
2993 | |||||||
2994 | return false; | ||||||
2995 | } | ||||||
2996 | |||||||
2997 | // Adjust the global exec mode flag that tells the runtime what mode this | ||||||
2998 | // kernel is executed in. | ||||||
2999 | Function *Kernel = getAnchorScope(); | ||||||
3000 | GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable( | ||||||
3001 | (Kernel->getName() + "_exec_mode").str()); | ||||||
3002 | assert(ExecMode && "Kernel without exec mode?"); | ||||||
3003 | assert(ExecMode->getInitializer() && | ||||||
3004 | ExecMode->getInitializer()->isOneValue() && | ||||||
3005 | "Initially non-SPMD kernel has SPMD exec mode!"); | ||||||
3006 | |||||||
3007 | // Set the global exec mode flag to indicate SPMD-Generic mode. | ||||||
3008 | constexpr int SPMDGeneric = 2; | ||||||
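// Assumed encoding: the exec mode global holds 1 for generic kernels (as
// asserted above) and is set to 2 for SPMD-Generic; 0 would indicate a kernel
// that is already SPMD and is left untouched by the check below.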
3009 | if (!ExecMode->getInitializer()->isZeroValue()) | ||||||
3010 | ExecMode->setInitializer( | ||||||
3011 | ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric)); | ||||||
3012 | |||||||
3013 | // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. | ||||||
3014 | const int InitIsSPMDArgNo = 1; | ||||||
3015 | const int DeinitIsSPMDArgNo = 1; | ||||||
3016 | const int InitUseStateMachineArgNo = 2; | ||||||
3017 | const int InitRequiresFullRuntimeArgNo = 3; | ||||||
3018 | const int DeinitRequiresFullRuntimeArgNo = 2; | ||||||
3019 | |||||||
3020 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3021 | A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), | ||||||
3022 | *ConstantInt::getBool(Ctx, 1)); | ||||||
3023 | A.changeUseAfterManifest( | ||||||
3024 | KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), | ||||||
3025 | *ConstantInt::getBool(Ctx, 0)); | ||||||
3026 | A.changeUseAfterManifest( | ||||||
3027 | KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), | ||||||
3028 | *ConstantInt::getBool(Ctx, 1)); | ||||||
3029 | A.changeUseAfterManifest( | ||||||
3030 | KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), | ||||||
3031 | *ConstantInt::getBool(Ctx, 0)); | ||||||
3032 | A.changeUseAfterManifest( | ||||||
3033 | KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo), | ||||||
3034 | *ConstantInt::getBool(Ctx, 0)); | ||||||
3035 | |||||||
3036 | ++NumOpenMPTargetRegionKernelsSPMD; | ||||||
3037 | |||||||
3038 | auto Remark = [&](OptimizationRemark OR) { | ||||||
3039 | return OR << "Transformed generic-mode kernel to SPMD-mode."; | ||||||
3040 | }; | ||||||
3041 | A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark); | ||||||
3042 | return true; | ||||||
3043 | }; | ||||||
3044 | |||||||
3045 | ChangeStatus buildCustomStateMachine(Attributor &A) { | ||||||
3046 | assert(ReachedKnownParallelRegions.isValidState() && | ||||||
3047 | "Custom state machine with invalid parallel region states?"); | ||||||
3048 | |||||||
3049 | const int InitIsSPMDArgNo = 1; | ||||||
3050 | const int InitUseStateMachineArgNo = 2; | ||||||
3051 | |||||||
3052 | // Check if the current configuration is non-SPMD mode with the generic state | ||||||
3053 | // machine. If we already have SPMD mode or a custom state machine, we do not | ||||||
3054 | // need to go any further. If either argument is anything but a constant, | ||||||
3055 | // something is weird and we give up. | ||||||
3056 | ConstantInt *UseStateMachine = dyn_cast<ConstantInt>( | ||||||
3057 | KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); | ||||||
3058 | ConstantInt *IsSPMD = | ||||||
3059 | dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); | ||||||
3060 | |||||||
3061 | // If we are stuck with generic mode, try to create a custom device (=GPU) | ||||||
3062 | // state machine which is specialized for the parallel regions that are | ||||||
3063 | // reachable by the kernel. | ||||||
3064 | if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD || | ||||||
3065 | !IsSPMD->isZero()) | ||||||
3066 | return ChangeStatus::UNCHANGED; | ||||||
3067 | |||||||
3068 | // If not SPMD mode, indicate we use a custom state machine now. | ||||||
3069 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3070 | auto *FalseVal = ConstantInt::getBool(Ctx, 0); | ||||||
3071 | A.changeUseAfterManifest( | ||||||
3072 | KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal); | ||||||
3073 | |||||||
3074 | // If we don't actually need a state machine we are done here. This can | ||||||
3075 | // happen if there simply are no parallel regions. In the resulting kernel | ||||||
3076 | // all worker threads will simply exit right away, leaving the main thread | ||||||
3077 | // to do the work alone. | ||||||
3078 | if (ReachedKnownParallelRegions.empty() && | ||||||
3079 | ReachedUnknownParallelRegions.empty()) { | ||||||
3080 | ++NumOpenMPTargetRegionKernelsWithoutStateMachine; | ||||||
3081 | |||||||
3082 | auto Remark = [&](OptimizationRemark OR) { | ||||||
3083 | return OR << "Removing unused state machine from generic-mode kernel."; | ||||||
3084 | }; | ||||||
3085 | A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark); | ||||||
3086 | |||||||
3087 | return ChangeStatus::CHANGED; | ||||||
3088 | } | ||||||
3089 | |||||||
3090 | // Keep track in the statistics of our new shiny custom state machine. | ||||||
3091 | if (ReachedUnknownParallelRegions.empty()) { | ||||||
3092 | ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback; | ||||||
3093 | |||||||
3094 | auto Remark = [&](OptimizationRemark OR) { | ||||||
3095 | return OR << "Rewriting generic-mode kernel with a customized state " | ||||||
3096 | "machine."; | ||||||
3097 | }; | ||||||
3098 | A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark); | ||||||
3099 | } else { | ||||||
3100 | ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback; | ||||||
3101 | |||||||
3102 | auto Remark = [&](OptimizationRemarkAnalysis OR) { | ||||||
3103 | return OR << "Generic-mode kernel is executed with a customized state " | ||||||
3104 | "machine that requires a fallback."; | ||||||
3105 | }; | ||||||
3106 | A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark); | ||||||
3107 | |||||||
3108 | // Tell the user why we ended up with a fallback. | ||||||
3109 | for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) { | ||||||
3110 | if (!UnknownParallelRegionCB) | ||||||
3111 | continue; | ||||||
3112 | auto Remark = [&](OptimizationRemarkAnalysis ORA) { | ||||||
3113 | return ORA << "Call may contain unknown parallel regions. Use " | ||||||
3114 | << "`__attribute__((assume(\"omp_no_parallelism\")))` to " | ||||||
3115 | "override."; | ||||||
3116 | }; | ||||||
3117 | A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB, | ||||||
3118 | "OMP133", Remark); | ||||||
3119 | } | ||||||
3120 | } | ||||||
3121 | |||||||
3122 | // Create all the blocks: | ||||||
3123 | // | ||||||
3124 | // InitCB = __kmpc_target_init(...) | ||||||
3125 | // bool IsWorker = InitCB >= 0; | ||||||
3126 | // if (IsWorker) { | ||||||
3127 | // SMBeginBB: __kmpc_barrier_simple_spmd(...); | ||||||
3128 | // void *WorkFn; | ||||||
3129 | // bool Active = __kmpc_kernel_parallel(&WorkFn); | ||||||
3130 | // if (!WorkFn) return; | ||||||
3131 | // SMIsActiveCheckBB: if (Active) { | ||||||
3132 | // SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>) | ||||||
3133 | // ParFn0(...); | ||||||
3134 | // SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>) | ||||||
3135 | // ParFn1(...); | ||||||
3136 | // ... | ||||||
3137 | // SMIfCascadeCurrentBB: else | ||||||
3138 | // ((WorkFnTy*)WorkFn)(...); | ||||||
3139 | // SMEndParallelBB: __kmpc_kernel_end_parallel(...); | ||||||
3140 | // } | ||||||
3141 | // SMDoneBB: __kmpc_barrier_simple_spmd(...); | ||||||
3142 | // goto SMBeginBB; | ||||||
3143 | // } | ||||||
3144 | // UserCodeEntryBB: // user code | ||||||
3145 | // __kmpc_target_deinit(...) | ||||||
3146 | // | ||||||
3147 | Function *Kernel = getAssociatedFunction(); | ||||||
3148 | assert(Kernel && "Expected an associated function!"); | ||||||
3149 | |||||||
3150 | BasicBlock *InitBB = KernelInitCB->getParent(); | ||||||
3151 | BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock( | ||||||
3152 | KernelInitCB->getNextNode(), "thread.user_code.check"); | ||||||
3153 | BasicBlock *StateMachineBeginBB = BasicBlock::Create( | ||||||
3154 | Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB); | ||||||
3155 | BasicBlock *StateMachineFinishedBB = BasicBlock::Create( | ||||||
3156 | Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB); | ||||||
3157 | BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create( | ||||||
3158 | Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB); | ||||||
3159 | BasicBlock *StateMachineIfCascadeCurrentBB = | ||||||
3160 | BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check", | ||||||
3161 | Kernel, UserCodeEntryBB); | ||||||
3162 | BasicBlock *StateMachineEndParallelBB = | ||||||
3163 | BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end", | ||||||
3164 | Kernel, UserCodeEntryBB); | ||||||
3165 | BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create( | ||||||
3166 | Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB); | ||||||
3167 | A.registerManifestAddedBasicBlock(*InitBB); | ||||||
3168 | A.registerManifestAddedBasicBlock(*UserCodeEntryBB); | ||||||
3169 | A.registerManifestAddedBasicBlock(*StateMachineBeginBB); | ||||||
3170 | A.registerManifestAddedBasicBlock(*StateMachineFinishedBB); | ||||||
3171 | A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB); | ||||||
3172 | A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB); | ||||||
3173 | A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB); | ||||||
3174 | A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB); | ||||||
3175 | |||||||
3176 | const DebugLoc &DLoc = KernelInitCB->getDebugLoc(); | ||||||
3177 | ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc); | ||||||
3178 | |||||||
3179 | InitBB->getTerminator()->eraseFromParent(); | ||||||
3180 | Instruction *IsWorker = | ||||||
3181 | ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB, | ||||||
3182 | ConstantInt::get(KernelInitCB->getType(), -1), | ||||||
3183 | "thread.is_worker", InitBB); | ||||||
3184 | IsWorker->setDebugLoc(DLoc); | ||||||
3185 | BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB); | ||||||
3186 | |||||||
3187 | Module &M = *Kernel->getParent(); | ||||||
3188 | |||||||
3189 | // Create local storage for the work function pointer. | ||||||
3190 | const DataLayout &DL = M.getDataLayout(); | ||||||
3191 | Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); | ||||||
3192 | Instruction *WorkFnAI = | ||||||
3193 | new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr, | ||||||
3194 | "worker.work_fn.addr", &Kernel->getEntryBlock().front()); | ||||||
3195 | WorkFnAI->setDebugLoc(DLoc); | ||||||
3196 | |||||||
3197 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
3198 | OMPInfoCache.OMPBuilder.updateToLocation( | ||||||
3199 | OpenMPIRBuilder::LocationDescription( | ||||||
3200 | IRBuilder<>::InsertPoint(StateMachineBeginBB, | ||||||
3201 | StateMachineBeginBB->end()), | ||||||
3202 | DLoc)); | ||||||
3203 | |||||||
3204 | Value *Ident = KernelInitCB->getArgOperand(0); | ||||||
3205 | Value *GTid = KernelInitCB; | ||||||
3206 | |||||||
3207 | FunctionCallee BarrierFn = | ||||||
3208 | OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( | ||||||
3209 | M, OMPRTL___kmpc_barrier_simple_spmd); | ||||||
3210 | CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB) | ||||||
3211 | ->setDebugLoc(DLoc); | ||||||
3212 | |||||||
3213 | if (WorkFnAI->getType()->getPointerAddressSpace() != | ||||||
3214 | (unsigned int)AddressSpace::Generic) { | ||||||
3215 | WorkFnAI = new AddrSpaceCastInst( | ||||||
3216 | WorkFnAI, | ||||||
3217 | PointerType::getWithSamePointeeType( | ||||||
3218 | cast<PointerType>(WorkFnAI->getType()), | ||||||
3219 | (unsigned int)AddressSpace::Generic), | ||||||
3220 | WorkFnAI->getName() + ".generic", StateMachineBeginBB); | ||||||
3221 | WorkFnAI->setDebugLoc(DLoc); | ||||||
3222 | } | ||||||
3223 | |||||||
3224 | FunctionCallee KernelParallelFn = | ||||||
3225 | OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( | ||||||
3226 | M, OMPRTL___kmpc_kernel_parallel); | ||||||
3227 | Instruction *IsActiveWorker = CallInst::Create( | ||||||
3228 | KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB); | ||||||
3229 | IsActiveWorker->setDebugLoc(DLoc); | ||||||
3230 | Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn", | ||||||
3231 | StateMachineBeginBB); | ||||||
3232 | WorkFn->setDebugLoc(DLoc); | ||||||
3233 | |||||||
3234 | FunctionType *ParallelRegionFnTy = FunctionType::get( | ||||||
3235 | Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)}, | ||||||
3236 | false); | ||||||
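// The parallel region functions reached from this state machine are assumed to
// have the signature void(i16, i32); they are invoked below with a zero first
// argument and the thread id returned by __kmpc_target_init as the second.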
3237 | Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( | ||||||
3238 | WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast", | ||||||
3239 | StateMachineBeginBB); | ||||||
3240 | |||||||
3241 | Instruction *IsDone = | ||||||
3242 | ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn, | ||||||
3243 | Constant::getNullValue(VoidPtrTy), "worker.is_done", | ||||||
3244 | StateMachineBeginBB); | ||||||
3245 | IsDone->setDebugLoc(DLoc); | ||||||
3246 | BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB, | ||||||
3247 | IsDone, StateMachineBeginBB) | ||||||
3248 | ->setDebugLoc(DLoc); | ||||||
3249 | |||||||
3250 | BranchInst::Create(StateMachineIfCascadeCurrentBB, | ||||||
3251 | StateMachineDoneBarrierBB, IsActiveWorker, | ||||||
3252 | StateMachineIsActiveCheckBB) | ||||||
3253 | ->setDebugLoc(DLoc); | ||||||
3254 | |||||||
3255 | Value *ZeroArg = | ||||||
3256 | Constant::getNullValue(ParallelRegionFnTy->getParamType(0)); | ||||||
3257 | |||||||
3258 | // Now that we have most of the CFG skeleton it is time for the if-cascade | ||||||
3259 | // that checks the function pointer we got from the runtime against the | ||||||
3260 | // parallel regions we expect, if there are any. | ||||||
3261 | for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) { | ||||||
3262 | auto *ParallelRegion = ReachedKnownParallelRegions[i]; | ||||||
3263 | BasicBlock *PRExecuteBB = BasicBlock::Create( | ||||||
3264 | Ctx, "worker_state_machine.parallel_region.execute", Kernel, | ||||||
3265 | StateMachineEndParallelBB); | ||||||
3266 | CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB) | ||||||
3267 | ->setDebugLoc(DLoc); | ||||||
3268 | BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB) | ||||||
3269 | ->setDebugLoc(DLoc); | ||||||
3270 | |||||||
3271 | BasicBlock *PRNextBB = | ||||||
3272 | BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check", | ||||||
3273 | Kernel, StateMachineEndParallelBB); | ||||||
3274 | |||||||
3275 | // Check if we need to compare the pointer at all or if we can just | ||||||
3276 | // call the parallel region function. | ||||||
3277 | Value *IsPR; | ||||||
3278 | if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) { | ||||||
3279 | Instruction *CmpI = ICmpInst::Create( | ||||||
3280 | ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion, | ||||||
3281 | "worker.check_parallel_region", StateMachineIfCascadeCurrentBB); | ||||||
3282 | CmpI->setDebugLoc(DLoc); | ||||||
3283 | IsPR = CmpI; | ||||||
3284 | } else { | ||||||
3285 | IsPR = ConstantInt::getTrue(Ctx); | ||||||
3286 | } | ||||||
3287 | |||||||
3288 | BranchInst::Create(PRExecuteBB, PRNextBB, IsPR, | ||||||
3289 | StateMachineIfCascadeCurrentBB) | ||||||
3290 | ->setDebugLoc(DLoc); | ||||||
3291 | StateMachineIfCascadeCurrentBB = PRNextBB; | ||||||
3292 | } | ||||||
3293 | |||||||
3294 | // At the end of the if-cascade we place the indirect function pointer call | ||||||
3295 | // in case we might need it, that is if there can be parallel regions we | ||||||
3296 | // have not handled in the if-cascade above. | ||||||
3297 | if (!ReachedUnknownParallelRegions.empty()) { | ||||||
3298 | StateMachineIfCascadeCurrentBB->setName( | ||||||
3299 | "worker_state_machine.parallel_region.fallback.execute"); | ||||||
3300 | CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "", | ||||||
3301 | StateMachineIfCascadeCurrentBB) | ||||||
3302 | ->setDebugLoc(DLoc); | ||||||
3303 | } | ||||||
3304 | BranchInst::Create(StateMachineEndParallelBB, | ||||||
3305 | StateMachineIfCascadeCurrentBB) | ||||||
3306 | ->setDebugLoc(DLoc); | ||||||
3307 | |||||||
3308 | CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction( | ||||||
3309 | M, OMPRTL___kmpc_kernel_end_parallel), | ||||||
3310 | {}, "", StateMachineEndParallelBB) | ||||||
3311 | ->setDebugLoc(DLoc); | ||||||
3312 | BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB) | ||||||
3313 | ->setDebugLoc(DLoc); | ||||||
3314 | |||||||
3315 | CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB) | ||||||
3316 | ->setDebugLoc(DLoc); | ||||||
3317 | BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB) | ||||||
3318 | ->setDebugLoc(DLoc); | ||||||
3319 | |||||||
3320 | return ChangeStatus::CHANGED; | ||||||
3321 | } | ||||||
3322 | |||||||
3323 | /// Fixpoint iteration update function. Will be called every time a dependence | ||||||
3324 | /// changed its state (and in the beginning). | ||||||
3325 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
3326 | KernelInfoState StateBefore = getState(); | ||||||
3327 | |||||||
3328 | // Callback to check a read/write instruction. | ||||||
3329 | auto CheckRWInst = [&](Instruction &I) { | ||||||
3330 | // We handle calls later. | ||||||
3331 | if (isa<CallBase>(I)) | ||||||
3332 | return true; | ||||||
3333 | // We only care about write effects. | ||||||
3334 | if (!I.mayWriteToMemory()) | ||||||
3335 | return true; | ||||||
3336 | if (auto *SI = dyn_cast<StoreInst>(&I)) { | ||||||
3337 | SmallVector<const Value *> Objects; | ||||||
3338 | getUnderlyingObjects(SI->getPointerOperand(), Objects); | ||||||
3339 | if (llvm::all_of(Objects, | ||||||
3340 | [](const Value *Obj) { return isa<AllocaInst>(Obj); })) | ||||||
3341 | return true; | ||||||
3342 | } | ||||||
3343 | // For now, record any other write (not provably into an alloca) as SPMD-incompatible. | ||||||
3344 | SPMDCompatibilityTracker.insert(&I); | ||||||
3345 | return true; | ||||||
3346 | }; | ||||||
3347 | |||||||
3348 | bool UsedAssumedInformationInCheckRWInst = false; | ||||||
3349 | if (!SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3350 | if (!A.checkForAllReadWriteInstructions( | ||||||
3351 | CheckRWInst, *this, UsedAssumedInformationInCheckRWInst)) | ||||||
3352 | SPMDCompatibilityTracker.indicatePessimisticFixpoint(); | ||||||
3353 | |||||||
3354 | if (!IsKernelEntry) { | ||||||
3355 | updateReachingKernelEntries(A); | ||||||
3356 | updateParallelLevels(A); | ||||||
3357 | } | ||||||
3358 | |||||||
3359 | // Callback to check a call instruction. | ||||||
3360 | bool AllSPMDStatesWereFixed = true; | ||||||
3361 | auto CheckCallInst = [&](Instruction &I) { | ||||||
3362 | auto &CB = cast<CallBase>(I); | ||||||
3363 | auto &CBAA = A.getAAFor<AAKernelInfo>( | ||||||
3364 | *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL); | ||||||
3365 | getState() ^= CBAA.getState(); | ||||||
3366 | AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint(); | ||||||
3367 | return true; | ||||||
3368 | }; | ||||||
3369 | |||||||
3370 | bool UsedAssumedInformationInCheckCallInst = false; | ||||||
3371 | if (!A.checkForAllCallLikeInstructions( | ||||||
3372 | CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) | ||||||
3373 | return indicatePessimisticFixpoint(); | ||||||
3374 | |||||||
3375 | // If we haven't used any assumed information for the SPMD state we can fix | ||||||
3376 | // it. | ||||||
3377 | if (!UsedAssumedInformationInCheckRWInst && | ||||||
3378 | !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed) | ||||||
3379 | SPMDCompatibilityTracker.indicateOptimisticFixpoint(); | ||||||
3380 | |||||||
3381 | return StateBefore == getState() ? ChangeStatus::UNCHANGED | ||||||
3382 | : ChangeStatus::CHANGED; | ||||||
3383 | } | ||||||
3384 | |||||||
3385 | private: | ||||||
3386 | /// Update info regarding reaching kernels. | ||||||
3387 | void updateReachingKernelEntries(Attributor &A) { | ||||||
3388 | auto PredCallSite = [&](AbstractCallSite ACS) { | ||||||
3389 | Function *Caller = ACS.getInstruction()->getFunction(); | ||||||
3390 | |||||||
3391 | assert(Caller && "Caller is nullptr"); | ||||||
3392 | |||||||
3393 | auto &CAA = A.getOrCreateAAFor<AAKernelInfo>( | ||||||
3394 | IRPosition::function(*Caller), this, DepClassTy::REQUIRED); | ||||||
3395 | if (CAA.ReachingKernelEntries.isValidState()) { | ||||||
3396 | ReachingKernelEntries ^= CAA.ReachingKernelEntries; | ||||||
3397 | return true; | ||||||
3398 | } | ||||||
3399 | |||||||
3400 | // We lost track of the caller of the associated function, any kernel | ||||||
3401 | // could reach now. | ||||||
3402 | ReachingKernelEntries.indicatePessimisticFixpoint(); | ||||||
3403 | |||||||
3404 | return true; | ||||||
3405 | }; | ||||||
3406 | |||||||
3407 | bool AllCallSitesKnown; | ||||||
3408 | if (!A.checkForAllCallSites(PredCallSite, *this, | ||||||
3409 | true /* RequireAllCallSites */, | ||||||
3410 | AllCallSitesKnown)) | ||||||
3411 | ReachingKernelEntries.indicatePessimisticFixpoint(); | ||||||
3412 | } | ||||||
3413 | |||||||
3414 | /// Update info regarding parallel levels. | ||||||
3415 | void updateParallelLevels(Attributor &A) { | ||||||
3416 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
3417 | OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI = | ||||||
3418 | OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51]; | ||||||
3419 | |||||||
3420 | auto PredCallSite = [&](AbstractCallSite ACS) { | ||||||
3421 | Function *Caller = ACS.getInstruction()->getFunction(); | ||||||
3422 | |||||||
3423 | assert(Caller && "Caller is nullptr"); | ||||||
3424 | |||||||
3425 | auto &CAA = | ||||||
3426 | A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller)); | ||||||
3427 | if (CAA.ParallelLevels.isValidState()) { | ||||||
3428 | // Any function that is called by `__kmpc_parallel_51` will not be | ||||||
3429 | // folded as the parallel level in the function is updated. In order to | ||||||
3430 | // get it right, the analysis would have to depend on the implementation. | ||||||
3431 | // That said, if the implementation changes in the future, the analysis | ||||||
3432 | // could become wrong. As a consequence, we are just conservative here. | ||||||
3433 | if (Caller == Parallel51RFI.Declaration) { | ||||||
3434 | ParallelLevels.indicatePessimisticFixpoint(); | ||||||
3435 | return true; | ||||||
3436 | } | ||||||
3437 | |||||||
3438 | ParallelLevels ^= CAA.ParallelLevels; | ||||||
3439 | |||||||
3440 | return true; | ||||||
3441 | } | ||||||
3442 | |||||||
3443 | // We lost track of the caller of the associated function, any kernel | ||||||
3444 | // could reach now. | ||||||
3445 | ParallelLevels.indicatePessimisticFixpoint(); | ||||||
3446 | |||||||
3447 | return true; | ||||||
3448 | }; | ||||||
3449 | |||||||
3450 | bool AllCallSitesKnown = true; | ||||||
3451 | if (!A.checkForAllCallSites(PredCallSite, *this, | ||||||
3452 | true /* RequireAllCallSites */, | ||||||
3453 | AllCallSitesKnown)) | ||||||
3454 | ParallelLevels.indicatePessimisticFixpoint(); | ||||||
3455 | } | ||||||
3456 | }; | ||||||
3457 | |||||||
3458 | /// The call site kernel info abstract attribute, basically, what can we say | ||||||
3459 | /// about a call site with regards to the KernelInfoState. For now this simply | ||||||
3460 | /// forwards the information from the callee. | ||||||
3461 | struct AAKernelInfoCallSite : AAKernelInfo { | ||||||
3462 | AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A) | ||||||
3463 | : AAKernelInfo(IRP, A) {} | ||||||
3464 | |||||||
3465 | /// See AbstractAttribute::initialize(...). | ||||||
3466 | void initialize(Attributor &A) override { | ||||||
3467 | AAKernelInfo::initialize(A); | ||||||
3468 | |||||||
3469 | CallBase &CB = cast<CallBase>(getAssociatedValue()); | ||||||
3470 | Function *Callee = getAssociatedFunction(); | ||||||
3471 | |||||||
3472 | // Helper to lookup an assumption string. | ||||||
3473 | auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) { | ||||||
3474 | return Fn && hasAssumption(*Fn, AssumptionStr); | ||||||
3475 | }; | ||||||
3476 | |||||||
3477 | // Check for SPMD-mode assumptions. | ||||||
3478 | if (HasAssumption(Callee, "ompx_spmd_amenable")) | ||||||
3479 | SPMDCompatibilityTracker.indicateOptimisticFixpoint(); | ||||||
3480 | |||||||
3481 | // First weed out calls we do not care about, that is readonly/readnone | ||||||
3482 | // calls, intrinsics, and "no_openmp" calls. None of these can reach a | ||||||
3483 | // parallel region or anything else we are looking for. | ||||||
3484 | if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) { | ||||||
3485 | indicateOptimisticFixpoint(); | ||||||
3486 | return; | ||||||
3487 | } | ||||||
3488 | |||||||
3489 | // Next we check if we know the callee. If it is a known OpenMP function | ||||||
3490 | // we will handle them explicitly in the switch below. If it is not, we | ||||||
3491 | // will use an AAKernelInfo object on the callee to gather information and | ||||||
3492 | // merge that into the current state. The latter happens in the updateImpl. | ||||||
3493 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
3494 | const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee); | ||||||
3495 | if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { | ||||||
3496 | // Unknown callees or declarations are not analyzable, so we give up. | ||||||
3497 | if (!Callee || !A.isFunctionIPOAmendable(*Callee)) { | ||||||
3498 | |||||||
3499 | // Unknown callees might contain parallel regions, except if they have | ||||||
3500 | // an appropriate assumption attached. | ||||||
3501 | if (!(HasAssumption(Callee, "omp_no_openmp") || | ||||||
3502 | HasAssumption(Callee, "omp_no_parallelism"))) | ||||||
3503 | ReachedUnknownParallelRegions.insert(&CB); | ||||||
3504 | |||||||
3505 | // If SPMDCompatibilityTracker is not fixed, we need to give up on the | ||||||
3506 | // idea we can run something unknown in SPMD-mode. | ||||||
3507 | if (!SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3508 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3509 | |||||||
3510 | // We have updated the state for this unknown call properly; there won't | ||||||
3511 | // be any further change, so we indicate a fixpoint. | ||||||
3512 | indicateOptimisticFixpoint(); | ||||||
3513 | } | ||||||
3514 | // If the callee is known and can be used in IPO, we will update the state | ||||||
3515 | // based on the callee state in updateImpl. | ||||||
3516 | return; | ||||||
3517 | } | ||||||
3518 | |||||||
3519 | const unsigned int WrapperFunctionArgNo = 6; | ||||||
3520 | RuntimeFunction RF = It->getSecond(); | ||||||
3521 | switch (RF) { | ||||||
3522 | // All the functions we know are compatible with SPMD mode. | ||||||
3523 | case OMPRTL___kmpc_is_spmd_exec_mode: | ||||||
3524 | case OMPRTL___kmpc_for_static_fini: | ||||||
3525 | case OMPRTL___kmpc_global_thread_num: | ||||||
3526 | case OMPRTL___kmpc_get_hardware_num_threads_in_block: | ||||||
3527 | case OMPRTL___kmpc_get_hardware_num_blocks: | ||||||
3528 | case OMPRTL___kmpc_single: | ||||||
3529 | case OMPRTL___kmpc_end_single: | ||||||
3530 | case OMPRTL___kmpc_master: | ||||||
3531 | case OMPRTL___kmpc_end_master: | ||||||
3532 | case OMPRTL___kmpc_barrier: | ||||||
3533 | break; | ||||||
3534 | case OMPRTL___kmpc_for_static_init_4: | ||||||
3535 | case OMPRTL___kmpc_for_static_init_4u: | ||||||
3536 | case OMPRTL___kmpc_for_static_init_8: | ||||||
3537 | case OMPRTL___kmpc_for_static_init_8u: { | ||||||
3538 | // Check the schedule and allow static schedule in SPMD mode. | ||||||
3539 | unsigned ScheduleArgOpNo = 2; | ||||||
3540 | auto *ScheduleTypeCI = | ||||||
3541 | dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo)); | ||||||
3542 | unsigned ScheduleTypeVal = | ||||||
3543 | ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0; | ||||||
3544 | switch (OMPScheduleType(ScheduleTypeVal)) { | ||||||
3545 | case OMPScheduleType::Static: | ||||||
3546 | case OMPScheduleType::StaticChunked: | ||||||
3547 | case OMPScheduleType::Distribute: | ||||||
3548 | case OMPScheduleType::DistributeChunked: | ||||||
3549 | break; | ||||||
3550 | default: | ||||||
3551 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3552 | break; | ||||||
3553 | }; | ||||||
3554 | } break; | ||||||
3555 | case OMPRTL___kmpc_target_init: | ||||||
3556 | KernelInitCB = &CB; | ||||||
3557 | break; | ||||||
3558 | case OMPRTL___kmpc_target_deinit: | ||||||
3559 | KernelDeinitCB = &CB; | ||||||
3560 | break; | ||||||
3561 | case OMPRTL___kmpc_parallel_51: | ||||||
3562 | if (auto *ParallelRegion = dyn_cast<Function>( | ||||||
3563 | CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) { | ||||||
3564 | ReachedKnownParallelRegions.insert(ParallelRegion); | ||||||
3565 | break; | ||||||
3566 | } | ||||||
3567 | // The condition above should usually get the parallel region function | ||||||
3568 | // pointer and record it. In the off chance it doesn't we assume the | ||||||
3569 | // worst. | ||||||
3570 | ReachedUnknownParallelRegions.insert(&CB); | ||||||
3571 | break; | ||||||
3572 | case OMPRTL___kmpc_omp_task: | ||||||
3573 | // We do not look into tasks right now, just give up. | ||||||
3574 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3575 | ReachedUnknownParallelRegions.insert(&CB); | ||||||
3576 | break; | ||||||
3577 | case OMPRTL___kmpc_alloc_shared: | ||||||
3578 | case OMPRTL___kmpc_free_shared: | ||||||
3579 | // Return without setting a fixpoint, to be resolved in updateImpl. | ||||||
3580 | return; | ||||||
3581 | default: | ||||||
3582 | // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, | ||||||
3583 | // generally. | ||||||
3584 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3585 | break; | ||||||
3586 | } | ||||||
3587 | // All other OpenMP runtime calls will not reach parallel regions so they | ||||||
3588 | // can be safely ignored for now. Since it is a known OpenMP runtime call we | ||||||
3589 | // have now modeled all effects and there is no need for any update. | ||||||
3590 | indicateOptimisticFixpoint(); | ||||||
3591 | } | ||||||
3592 | |||||||
3593 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
3594 | // TODO: Once we have call site specific value information we can provide | ||||||
3595 | // call site specific liveness information and then it makes | ||||||
3596 | // sense to specialize attributes for call site arguments instead of | ||||||
3597 | // redirecting requests to the callee argument. | ||||||
3598 | Function *F = getAssociatedFunction(); | ||||||
3599 | |||||||
3600 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
3601 | const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F); | ||||||
3602 | |||||||
3603 | // If F is not a runtime function, propagate the AAKernelInfo of the callee. | ||||||
3604 | if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) { | ||||||
3605 | const IRPosition &FnPos = IRPosition::function(*F); | ||||||
3606 | auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED); | ||||||
3607 | if (getState() == FnAA.getState()) | ||||||
3608 | return ChangeStatus::UNCHANGED; | ||||||
3609 | getState() = FnAA.getState(); | ||||||
3610 | return ChangeStatus::CHANGED; | ||||||
3611 | } | ||||||
3612 | |||||||
3613 | // F is a runtime function that allocates or frees memory, check | ||||||
3614 | // AAHeapToStack and AAHeapToShared. | ||||||
3615 | KernelInfoState StateBefore = getState(); | ||||||
3616 | assert((It->getSecond() == OMPRTL___kmpc_alloc_shared || | ||||||
3617 | It->getSecond() == OMPRTL___kmpc_free_shared) && | ||||||
3618 | "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call"); | ||||||
3619 | |||||||
3620 | CallBase &CB = cast<CallBase>(getAssociatedValue()); | ||||||
3621 | |||||||
3622 | auto &HeapToStackAA = A.getAAFor<AAHeapToStack>( | ||||||
3623 | *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); | ||||||
3624 | auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>( | ||||||
3625 | *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL); | ||||||
3626 | |||||||
3627 | RuntimeFunction RF = It->getSecond(); | ||||||
3628 | |||||||
3629 | switch (RF) { | ||||||
3630 | // If neither HeapToStack nor HeapToShared assume the call is removed, | ||||||
3631 | // assume SPMD incompatibility. | ||||||
3632 | case OMPRTL___kmpc_alloc_shared: | ||||||
3633 | if (!HeapToStackAA.isAssumedHeapToStack(CB) && | ||||||
3634 | !HeapToSharedAA.isAssumedHeapToShared(CB)) | ||||||
3635 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3636 | break; | ||||||
3637 | case OMPRTL___kmpc_free_shared: | ||||||
3638 | if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) && | ||||||
3639 | !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB)) | ||||||
3640 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3641 | break; | ||||||
3642 | default: | ||||||
3643 | SPMDCompatibilityTracker.insert(&CB); | ||||||
3644 | } | ||||||
3645 | |||||||
3646 | return StateBefore == getState() ? ChangeStatus::UNCHANGED | ||||||
3647 | : ChangeStatus::CHANGED; | ||||||
3648 | } | ||||||
3649 | }; | ||||||
3650 | |||||||
3651 | struct AAFoldRuntimeCall | ||||||
3652 | : public StateWrapper<BooleanState, AbstractAttribute> { | ||||||
3653 | using Base = StateWrapper<BooleanState, AbstractAttribute>; | ||||||
3654 | |||||||
3655 | AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {} | ||||||
3656 | |||||||
3657 | /// Statistics are tracked as part of manifest for now. | ||||||
3658 | void trackStatistics() const override {} | ||||||
3659 | |||||||
3660 | /// Create an abstract attribute view for the position \p IRP. | ||||||
3661 | static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP, | ||||||
3662 | Attributor &A); | ||||||
3663 | |||||||
3664 | /// See AbstractAttribute::getName() | ||||||
3665 | const std::string getName() const override { return "AAFoldRuntimeCall"; } | ||||||
3666 | |||||||
3667 | /// See AbstractAttribute::getIdAddr() | ||||||
3668 | const char *getIdAddr() const override { return &ID; } | ||||||
3669 | |||||||
3670 | /// This function should return true if the type of the \p AA is | ||||||
3671 | /// AAFoldRuntimeCall | ||||||
3672 | static bool classof(const AbstractAttribute *AA) { | ||||||
3673 | return (AA->getIdAddr() == &ID); | ||||||
3674 | } | ||||||
3675 | |||||||
3676 | static const char ID; | ||||||
3677 | }; | ||||||
3678 | |||||||
3679 | struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { | ||||||
3680 | AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A) | ||||||
3681 | : AAFoldRuntimeCall(IRP, A) {} | ||||||
3682 | |||||||
3683 | /// See AbstractAttribute::getAsStr() | ||||||
3684 | const std::string getAsStr() const override { | ||||||
3685 | if (!isValidState()) | ||||||
3686 | return "<invalid>"; | ||||||
3687 | |||||||
3688 | std::string Str("simplified value: "); | ||||||
3689 | |||||||
3690 | if (!SimplifiedValue.hasValue()) | ||||||
3691 | return Str + std::string("none"); | ||||||
3692 | |||||||
3693 | if (!SimplifiedValue.getValue()) | ||||||
3694 | return Str + std::string("nullptr"); | ||||||
3695 | |||||||
3696 | if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue())) | ||||||
3697 | return Str + std::to_string(CI->getSExtValue()); | ||||||
3698 | |||||||
3699 | return Str + std::string("unknown"); | ||||||
3700 | } | ||||||
3701 | |||||||
3702 | void initialize(Attributor &A) override { | ||||||
3703 | Function *Callee = getAssociatedFunction(); | ||||||
3704 | |||||||
3705 | auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache()); | ||||||
3706 | const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee); | ||||||
3707 | assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() && | ||||||
3708 | "Expected a known OpenMP runtime function"); | ||||||
3709 | |||||||
3710 | RFKind = It->getSecond(); | ||||||
3711 | |||||||
3712 | CallBase &CB = cast<CallBase>(getAssociatedValue()); | ||||||
3713 | A.registerSimplificationCallback( | ||||||
3714 | IRPosition::callsite_returned(CB), | ||||||
3715 | [&](const IRPosition &IRP, const AbstractAttribute *AA, | ||||||
3716 | bool &UsedAssumedInformation) -> Optional<Value *> { | ||||||
3717 | assert((isValidState() || (SimplifiedValue.hasValue() && | ||||||
3718 | SimplifiedValue.getValue() == nullptr)) && | ||||||
3719 | "Unexpected invalid state!"); | ||||||
3720 | |||||||
3721 | if (!isAtFixpoint()) { | ||||||
3722 | UsedAssumedInformation = true; | ||||||
3723 | if (AA) | ||||||
3724 | A.recordDependence(*this, *AA, DepClassTy::OPTIONAL); | ||||||
3725 | } | ||||||
3726 | return SimplifiedValue; | ||||||
3727 | }); | ||||||
3728 | } | ||||||
3729 | |||||||
3730 | ChangeStatus updateImpl(Attributor &A) override { | ||||||
3731 | ChangeStatus Changed = ChangeStatus::UNCHANGED; | ||||||
3732 | switch (RFKind) { | ||||||
3733 | case OMPRTL___kmpc_is_spmd_exec_mode: | ||||||
3734 | Changed |= foldIsSPMDExecMode(A); | ||||||
3735 | break; | ||||||
3736 | case OMPRTL___kmpc_is_generic_main_thread_id: | ||||||
3737 | Changed |= foldIsGenericMainThread(A); | ||||||
3738 | break; | ||||||
3739 | case OMPRTL___kmpc_parallel_level: | ||||||
3740 | Changed |= foldParallelLevel(A); | ||||||
3741 | break; | ||||||
3742 | case OMPRTL___kmpc_get_hardware_num_threads_in_block: | ||||||
3743 | Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit"); | ||||||
3744 | break; | ||||||
3745 | case OMPRTL___kmpc_get_hardware_num_blocks: | ||||||
3746 | Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams"); | ||||||
3747 | break; | ||||||
3748 | default: | ||||||
3749 | llvm_unreachable("Unhandled OpenMP runtime function!"); | ||||||
3750 | } | ||||||
3751 | |||||||
3752 | return Changed; | ||||||
3753 | } | ||||||
3754 | |||||||
3755 | ChangeStatus manifest(Attributor &A) override { | ||||||
3756 | ChangeStatus Changed = ChangeStatus::UNCHANGED; | ||||||
3757 | |||||||
3758 | if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) { | ||||||
3759 | Instruction &CB = *getCtxI(); | ||||||
3760 | A.changeValueAfterManifest(CB, **SimplifiedValue); | ||||||
3761 | A.deleteAfterManifest(CB); | ||||||
3762 | |||||||
3763 | LLVM_DEBUG(dbgs() << TAG << "Folding runtime call: " << CB << " with " | ||||||
3764 | << **SimplifiedValue << "\n"); | ||||||
3765 | |||||||
3766 | Changed = ChangeStatus::CHANGED; | ||||||
3767 | } | ||||||
3768 | |||||||
3769 | return Changed; | ||||||
3770 | } | ||||||
3771 | |||||||
3772 | ChangeStatus indicatePessimisticFixpoint() override { | ||||||
3773 | SimplifiedValue = nullptr; | ||||||
3774 | return AAFoldRuntimeCall::indicatePessimisticFixpoint(); | ||||||
3775 | } | ||||||
3776 | |||||||
3777 | private: | ||||||
3778 | /// Fold __kmpc_is_spmd_exec_mode into a constant if possible. | ||||||
3779 | ChangeStatus foldIsSPMDExecMode(Attributor &A) { | ||||||
3780 | Optional<Value *> SimplifiedValueBefore = SimplifiedValue; | ||||||
3781 | |||||||
3782 | unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; | ||||||
3783 | unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; | ||||||
3784 | auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( | ||||||
3785 | *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); | ||||||
3786 | |||||||
3787 | if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) | ||||||
3788 | return indicatePessimisticFixpoint(); | ||||||
3789 | |||||||
3790 | for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { | ||||||
3791 | auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), | ||||||
3792 | DepClassTy::REQUIRED); | ||||||
3793 | |||||||
3794 | if (!AA.isValidState()) { | ||||||
3795 | SimplifiedValue = nullptr; | ||||||
3796 | return indicatePessimisticFixpoint(); | ||||||
3797 | } | ||||||
3798 | |||||||
3799 | if (AA.SPMDCompatibilityTracker.isAssumed()) { | ||||||
3800 | if (AA.SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3801 | ++KnownSPMDCount; | ||||||
3802 | else | ||||||
3803 | ++AssumedSPMDCount; | ||||||
3804 | } else { | ||||||
3805 | if (AA.SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3806 | ++KnownNonSPMDCount; | ||||||
3807 | else | ||||||
3808 | ++AssumedNonSPMDCount; | ||||||
3809 | } | ||||||
3810 | } | ||||||
3811 | |||||||
3812 | if ((AssumedSPMDCount + KnownSPMDCount) && | ||||||
3813 | (AssumedNonSPMDCount + KnownNonSPMDCount)) | ||||||
3814 | return indicatePessimisticFixpoint(); | ||||||
3815 | |||||||
3816 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3817 | if (KnownSPMDCount || AssumedSPMDCount) { | ||||||
3818 | assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && | ||||||
3819 | "Expected only SPMD kernels!"); | ||||||
3820 | // All reaching kernels are in SPMD mode. Update all function calls to | ||||||
3821 | // __kmpc_is_spmd_exec_mode to 1. | ||||||
3822 | SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true); | ||||||
3823 | } else if (KnownNonSPMDCount || AssumedNonSPMDCount) { | ||||||
3824 | assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 && | ||||||
3825 | "Expected only non-SPMD kernels!"); | ||||||
3826 | // All reaching kernels are in non-SPMD mode. Update all function | ||||||
3827 | // calls to __kmpc_is_spmd_exec_mode to 0. | ||||||
3828 | SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false); | ||||||
3829 | } else { | ||||||
3830 | // We have empty reaching kernels, therefore we cannot tell if the | ||||||
3831 | // associated call site can be folded. At this moment, SimplifiedValue | ||||||
3832 | // must be none. | ||||||
3833 | assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none"); | ||||||
3834 | } | ||||||
3835 | |||||||
3836 | return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED | ||||||
3837 | : ChangeStatus::CHANGED; | ||||||
3838 | } | ||||||
3839 | |||||||
3840 | /// Fold __kmpc_is_generic_main_thread_id into a constant if possible. | ||||||
3841 | ChangeStatus foldIsGenericMainThread(Attributor &A) { | ||||||
3842 | Optional<Value *> SimplifiedValueBefore = SimplifiedValue; | ||||||
3843 | |||||||
3844 | CallBase &CB = cast<CallBase>(getAssociatedValue()); | ||||||
3845 | Function *F = CB.getFunction(); | ||||||
3846 | const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>( | ||||||
3847 | *this, IRPosition::function(*F), DepClassTy::REQUIRED); | ||||||
3848 | |||||||
3849 | if (!ExecutionDomainAA.isValidState()) | ||||||
3850 | return indicatePessimisticFixpoint(); | ||||||
3851 | |||||||
3852 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3853 | if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB)) | ||||||
3854 | SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true); | ||||||
3855 | else | ||||||
3856 | return indicatePessimisticFixpoint(); | ||||||
3857 | |||||||
3858 | return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED | ||||||
3859 | : ChangeStatus::CHANGED; | ||||||
3860 | } | ||||||
3861 | |||||||
3862 | /// Fold __kmpc_parallel_level into a constant if possible. | ||||||
3863 | ChangeStatus foldParallelLevel(Attributor &A) { | ||||||
3864 | Optional<Value *> SimplifiedValueBefore = SimplifiedValue; | ||||||
3865 | |||||||
3866 | auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( | ||||||
3867 | *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); | ||||||
3868 | |||||||
3869 | if (!CallerKernelInfoAA.ParallelLevels.isValidState()) | ||||||
3870 | return indicatePessimisticFixpoint(); | ||||||
3871 | |||||||
3872 | if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) | ||||||
3873 | return indicatePessimisticFixpoint(); | ||||||
3874 | |||||||
3875 | if (CallerKernelInfoAA.ReachingKernelEntries.empty()) { | ||||||
3876 | assert(!SimplifiedValue.hasValue() && | ||||||
3877 | "SimplifiedValue should keep none at this point"); | ||||||
3878 | return ChangeStatus::UNCHANGED; | ||||||
3879 | } | ||||||
3880 | |||||||
3881 | unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0; | ||||||
3882 | unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0; | ||||||
3883 | for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { | ||||||
3884 | auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K), | ||||||
3885 | DepClassTy::REQUIRED); | ||||||
3886 | if (!AA.SPMDCompatibilityTracker.isValidState()) | ||||||
3887 | return indicatePessimisticFixpoint(); | ||||||
3888 | |||||||
3889 | if (AA.SPMDCompatibilityTracker.isAssumed()) { | ||||||
3890 | if (AA.SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3891 | ++KnownSPMDCount; | ||||||
3892 | else | ||||||
3893 | ++AssumedSPMDCount; | ||||||
3894 | } else { | ||||||
3895 | if (AA.SPMDCompatibilityTracker.isAtFixpoint()) | ||||||
3896 | ++KnownNonSPMDCount; | ||||||
3897 | else | ||||||
3898 | ++AssumedNonSPMDCount; | ||||||
3899 | } | ||||||
3900 | } | ||||||
3901 | |||||||
3902 | if ((AssumedSPMDCount + KnownSPMDCount) && | ||||||
3903 | (AssumedNonSPMDCount + KnownNonSPMDCount)) | ||||||
3904 | return indicatePessimisticFixpoint(); | ||||||
3905 | |||||||
3906 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3907 | // If the caller can only be reached by SPMD kernel entries, the parallel | ||||||
3908 | // level is 1. Similarly, if the caller can only be reached by non-SPMD | ||||||
3909 | // kernel entries, it is 0. | ||||||
3910 | if (AssumedSPMDCount || KnownSPMDCount) { | ||||||
3911 | assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 && | ||||||
3912 | "Expected only SPMD kernels!"); | ||||||
3913 | SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1); | ||||||
3914 | } else { | ||||||
3915 | assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 && | ||||||
3916 | "Expected only non-SPMD kernels!"); | ||||||
3917 | SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0); | ||||||
3918 | } | ||||||
3919 | return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED | ||||||
3920 | : ChangeStatus::CHANGED; | ||||||
3921 | } | ||||||
3922 | |||||||
3923 | ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) { | ||||||
3924 | // Specialize only if all the calls agree with the attribute constant value | ||||||
3925 | int32_t CurrentAttrValue = -1; | ||||||
3926 | Optional<Value *> SimplifiedValueBefore = SimplifiedValue; | ||||||
3927 | |||||||
3928 | auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>( | ||||||
3929 | *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED); | ||||||
3930 | |||||||
3931 | if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState()) | ||||||
3932 | return indicatePessimisticFixpoint(); | ||||||
3933 | |||||||
3934 | // Iterate over the kernels that reach this function | ||||||
3935 | for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) { | ||||||
3936 | int32_t NextAttrVal = -1; | ||||||
3937 | if (K->hasFnAttribute(Attr)) | ||||||
3938 | NextAttrVal = | ||||||
3939 | std::stoi(K->getFnAttribute(Attr).getValueAsString().str()); | ||||||
3940 | |||||||
3941 | if (NextAttrVal == -1 || | ||||||
3942 | (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal)) | ||||||
3943 | return indicatePessimisticFixpoint(); | ||||||
3944 | CurrentAttrValue = NextAttrVal; | ||||||
3945 | } | ||||||
3946 | |||||||
3947 | if (CurrentAttrValue != -1) { | ||||||
3948 | auto &Ctx = getAnchorValue().getContext(); | ||||||
3949 | SimplifiedValue = | ||||||
3950 | ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue); | ||||||
3951 | } | ||||||
3952 | return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED | ||||||
3953 | : ChangeStatus::CHANGED; | ||||||
3954 | } | ||||||
3955 | |||||||
3956 | /// An optional value the associated value is assumed to fold to. That is, we | ||||||
3957 | /// assume the associated value (which is a call) can be replaced by this | ||||||
3958 | /// simplified value. | ||||||
3959 | Optional<Value *> SimplifiedValue; | ||||||
3960 | |||||||
3961 | /// The runtime function kind of the callee of the associated call site. | ||||||
3962 | RuntimeFunction RFKind; | ||||||
3963 | }; | ||||||
3964 | |||||||
3965 | } // namespace | ||||||
3966 | |||||||
3967 | /// Register AAFoldRuntimeCall attributes for the call sites of \p RF. | ||||||
3968 | void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { | ||||||
3969 | auto &RFI = OMPInfoCache.RFIs[RF]; | ||||||
3970 | RFI.foreachUse(SCC, [&](Use &U, Function &F) { | ||||||
3971 | CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI); | ||||||
3972 | if (!CI) | ||||||
3973 | return false; | ||||||
3974 | A.getOrCreateAAFor<AAFoldRuntimeCall>( | ||||||
3975 | IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr, | ||||||
3976 | DepClassTy::NONE, /* ForceUpdate */ false, | ||||||
3977 | /* UpdateAfterInit */ false); | ||||||
3978 | return false; | ||||||
3979 | }); | ||||||
3980 | } | ||||||
3981 | |||||||
3982 | void OpenMPOpt::registerAAs(bool IsModulePass) { | ||||||
3983 | if (SCC.empty()) | ||||||
3984 | |||||||
3985 | return; | ||||||
3986 | if (IsModulePass) { | ||||||
3987 | // Ensure we create the AAKernelInfo AAs first and without triggering an | ||||||
3988 | // update. This will make sure we register all value simplification | ||||||
3989 | // callbacks before any other AA has the chance to create an AAValueSimplify | ||||||
3990 | // or similar. | ||||||
3991 | for (Function *Kernel : OMPInfoCache.Kernels) | ||||||
3992 | A.getOrCreateAAFor<AAKernelInfo>( | ||||||
3993 | IRPosition::function(*Kernel), /* QueryingAA */ nullptr, | ||||||
3994 | DepClassTy::NONE, /* ForceUpdate */ false, | ||||||
3995 | /* UpdateAfterInit */ false); | ||||||
3996 | |||||||
3997 | |||||||
3998 | registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id); | ||||||
3999 | registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode); | ||||||
4000 | registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level); | ||||||
4001 | registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block); | ||||||
4002 | registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks); | ||||||
4003 | } | ||||||
4004 | |||||||
4005 | // Create CallSite AA for all Getters. | ||||||
4006 | for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) { | ||||||
4007 | auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)]; | ||||||
4008 | |||||||
4009 | auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter]; | ||||||
4010 | |||||||
4011 | auto CreateAA = [&](Use &U, Function &Caller) { | ||||||
4012 | CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI); | ||||||
4013 | if (!CI) | ||||||
4014 | return false; | ||||||
4015 | |||||||
4016 | auto &CB = cast<CallBase>(*CI); | ||||||
4017 | |||||||
4018 | IRPosition CBPos = IRPosition::callsite_function(CB); | ||||||
4019 | A.getOrCreateAAFor<AAICVTracker>(CBPos); | ||||||
4020 | return false; | ||||||
4021 | }; | ||||||
4022 | |||||||
4023 | GetterRFI.foreachUse(SCC, CreateAA); | ||||||
4024 | } | ||||||
4025 | auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; | ||||||
4026 | auto CreateAA = [&](Use &U, Function &F) { | ||||||
4027 | A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F)); | ||||||
4028 | return false; | ||||||
4029 | }; | ||||||
4030 | GlobalizationRFI.foreachUse(SCC, CreateAA); | ||||||
4031 | |||||||
4032 | // Create an ExecutionDomain AA for every function and a HeapToStack AA for | ||||||
4033 | // every function if there is a device kernel. | ||||||
4034 | if (!isOpenMPDevice(M)) | ||||||
4035 | return; | ||||||
4036 | |||||||
4037 | for (auto *F : SCC) { | ||||||
4038 | if (F->isDeclaration()) | ||||||
4039 | continue; | ||||||
4040 | |||||||
4041 | A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F)); | ||||||
4042 | A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F)); | ||||||
4043 | |||||||
4044 | for (auto &I : instructions(*F)) { | ||||||
4045 | if (auto *LI = dyn_cast<LoadInst>(&I)) { | ||||||
4046 | bool UsedAssumedInformation = false; | ||||||
4047 | A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr, | ||||||
4048 | UsedAssumedInformation); | ||||||
4049 | } | ||||||
4050 | } | ||||||
4051 | } | ||||||
4052 | } | ||||||
4053 | |||||||
4054 | const char AAICVTracker::ID = 0; | ||||||
4055 | const char AAKernelInfo::ID = 0; | ||||||
4056 | const char AAExecutionDomain::ID = 0; | ||||||
4057 | const char AAHeapToShared::ID = 0; | ||||||
4058 | const char AAFoldRuntimeCall::ID = 0; | ||||||
4059 | |||||||
4060 | AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP, | ||||||
4061 | Attributor &A) { | ||||||
4062 | AAICVTracker *AA = nullptr; | ||||||
4063 | switch (IRP.getPositionKind()) { | ||||||
4064 | case IRPosition::IRP_INVALID: | ||||||
4065 | case IRPosition::IRP_FLOAT: | ||||||
4066 | case IRPosition::IRP_ARGUMENT: | ||||||
4067 | case IRPosition::IRP_CALL_SITE_ARGUMENT: | ||||||
4068 | llvm_unreachable("ICVTracker can only be created for function position!"); | ||||||
4069 | case IRPosition::IRP_RETURNED: | ||||||
4070 | AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A); | ||||||
4071 | break; | ||||||
4072 | case IRPosition::IRP_CALL_SITE_RETURNED: | ||||||
4073 | AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A); | ||||||
4074 | break; | ||||||
4075 | case IRPosition::IRP_CALL_SITE: | ||||||
4076 | AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A); | ||||||
4077 | break; | ||||||
4078 | case IRPosition::IRP_FUNCTION: | ||||||
4079 | AA = new (A.Allocator) AAICVTrackerFunction(IRP, A); | ||||||
4080 | break; | ||||||
4081 | } | ||||||
4082 | |||||||
4083 | return *AA; | ||||||
4084 | } | ||||||
4085 | |||||||
4086 | AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP, | ||||||
4087 | Attributor &A) { | ||||||
4088 | AAExecutionDomainFunction *AA = nullptr; | ||||||
4089 | switch (IRP.getPositionKind()) { | ||||||
4090 | case IRPosition::IRP_INVALID: | ||||||
4091 | case IRPosition::IRP_FLOAT: | ||||||
4092 | case IRPosition::IRP_ARGUMENT: | ||||||
4093 | case IRPosition::IRP_CALL_SITE_ARGUMENT: | ||||||
4094 | case IRPosition::IRP_RETURNED: | ||||||
4095 | case IRPosition::IRP_CALL_SITE_RETURNED: | ||||||
4096 | case IRPosition::IRP_CALL_SITE: | ||||||
4097 | llvm_unreachable( | ||||||
4098 | "AAExecutionDomain can only be created for function position!"); | ||||||
4099 | case IRPosition::IRP_FUNCTION: | ||||||
4100 | AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A); | ||||||
4101 | break; | ||||||
4102 | } | ||||||
4103 | |||||||
4104 | return *AA; | ||||||
4105 | } | ||||||
4106 | |||||||
4107 | AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP, | ||||||
4108 | Attributor &A) { | ||||||
4109 | AAHeapToSharedFunction *AA = nullptr; | ||||||
4110 | switch (IRP.getPositionKind()) { | ||||||
4111 | case IRPosition::IRP_INVALID: | ||||||
4112 | case IRPosition::IRP_FLOAT: | ||||||
4113 | case IRPosition::IRP_ARGUMENT: | ||||||
4114 | case IRPosition::IRP_CALL_SITE_ARGUMENT: | ||||||
4115 | case IRPosition::IRP_RETURNED: | ||||||
4116 | case IRPosition::IRP_CALL_SITE_RETURNED: | ||||||
4117 | case IRPosition::IRP_CALL_SITE: | ||||||
4118 | llvm_unreachable( | ||||||
4119 | "AAHeapToShared can only be created for function position!"); | ||||||
4120 | case IRPosition::IRP_FUNCTION: | ||||||
4121 | AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A); | ||||||
4122 | break; | ||||||
4123 | } | ||||||
4124 | |||||||
4125 | return *AA; | ||||||
4126 | } | ||||||
4127 | |||||||
4128 | AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP, | ||||||
4129 | Attributor &A) { | ||||||
4130 | AAKernelInfo *AA = nullptr; | ||||||
4131 | switch (IRP.getPositionKind()) { | ||||||
4132 | case IRPosition::IRP_INVALID: | ||||||
4133 | case IRPosition::IRP_FLOAT: | ||||||
4134 | case IRPosition::IRP_ARGUMENT: | ||||||
4135 | case IRPosition::IRP_RETURNED: | ||||||
4136 | case IRPosition::IRP_CALL_SITE_RETURNED: | ||||||
4137 | case IRPosition::IRP_CALL_SITE_ARGUMENT: | ||||||
4138 | llvm_unreachable("KernelInfo can only be created for function position!"); | ||||||
4139 | case IRPosition::IRP_CALL_SITE: | ||||||
4140 | AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A); | ||||||
4141 | break; | ||||||
4142 | case IRPosition::IRP_FUNCTION: | ||||||
4143 | AA = new (A.Allocator) AAKernelInfoFunction(IRP, A); | ||||||
4144 | break; | ||||||
4145 | } | ||||||
4146 | |||||||
4147 | return *AA; | ||||||
4148 | } | ||||||
4149 | |||||||
4150 | AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP, | ||||||
4151 | Attributor &A) { | ||||||
4152 | AAFoldRuntimeCall *AA = nullptr; | ||||||
4153 | switch (IRP.getPositionKind()) { | ||||||
4154 | case IRPosition::IRP_INVALID: | ||||||
4155 | case IRPosition::IRP_FLOAT: | ||||||
4156 | case IRPosition::IRP_ARGUMENT: | ||||||
4157 | case IRPosition::IRP_RETURNED: | ||||||
4158 | case IRPosition::IRP_FUNCTION: | ||||||
4159 | case IRPosition::IRP_CALL_SITE: | ||||||
4160 | case IRPosition::IRP_CALL_SITE_ARGUMENT: | ||||||
4161 | llvm_unreachable("AAFoldRuntimeCall can only be created for call site position!"); | ||||||
4162 | case IRPosition::IRP_CALL_SITE_RETURNED: | ||||||
4163 | AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A); | ||||||
4164 | break; | ||||||
4165 | } | ||||||
4166 | |||||||
4167 | return *AA; | ||||||
4168 | } | ||||||
4169 | |||||||
4170 | PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { | ||||||
4171 | if (!containsOpenMP(M)) | ||||||
4172 | return PreservedAnalyses::all(); | ||||||
4173 | if (DisableOpenMPOptimizations) | ||||||
4174 | return PreservedAnalyses::all(); | ||||||
4175 | |||||||
4176 | FunctionAnalysisManager &FAM = | ||||||
4177 | AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); | ||||||
4178 | KernelSet Kernels = getDeviceKernels(M); | ||||||
4179 | |||||||
4180 | auto IsCalled = [&](Function &F) { | ||||||
4181 | if (Kernels.contains(&F)) | ||||||
4182 | return true; | ||||||
4183 | for (const User *U : F.users()) | ||||||
4184 | if (!isa<BlockAddress>(U)) | ||||||
4185 | return true; | ||||||
4186 | return false; | ||||||
4187 | }; | ||||||
4188 | |||||||
4189 | auto EmitRemark = [&](Function &F) { | ||||||
4190 | auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); | ||||||
4191 | ORE.emit([&]() { | ||||||
4192 | OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F); | ||||||
4193 | return ORA << "Could not internalize function. " | ||||||
4194 | << "Some optimizations may not be possible. [OMP140]"; | ||||||
4195 | }); | ||||||
4196 | }; | ||||||
4197 | |||||||
4198 | // Create internal copies of each function if this is a kernel Module. This | ||||||
4199 | // allows interprocedural passes to see every call edge. | ||||||
4200 | DenseMap<Function *, Function *> InternalizedMap; | ||||||
4201 | if (isOpenMPDevice(M)) { | ||||||
4202 | SmallPtrSet<Function *, 16> InternalizeFns; | ||||||
4203 | for (Function &F : M) | ||||||
4204 | if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) && | ||||||
4205 | !DisableInternalization) { | ||||||
4206 | if (Attributor::isInternalizable(F)) { | ||||||
4207 | InternalizeFns.insert(&F); | ||||||
4208 | } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) { | ||||||
4209 | EmitRemark(F); | ||||||
4210 | } | ||||||
4211 | } | ||||||
4212 | |||||||
4213 | Attributor::internalizeFunctions(InternalizeFns, InternalizedMap); | ||||||
4214 | } | ||||||
4215 | |||||||
4216 | // Look at every function in the Module unless it was internalized. | ||||||
4217 | SmallVector<Function *, 16> SCC; | ||||||
4218 | for (Function &F : M) | ||||||
4219 | if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) | ||||||
4220 | SCC.push_back(&F); | ||||||
4221 | |||||||
4222 | if (SCC.empty()) | ||||||
4223 | return PreservedAnalyses::all(); | ||||||
4224 | |||||||
4225 | AnalysisGetter AG(FAM); | ||||||
4226 | |||||||
4227 | auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & { | ||||||
4228 | return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); | ||||||
4229 | }; | ||||||
4230 | |||||||
4231 | BumpPtrAllocator Allocator; | ||||||
4232 | CallGraphUpdater CGUpdater; | ||||||
4233 | |||||||
4234 | SetVector<Function *> Functions(SCC.begin(), SCC.end()); | ||||||
4235 | OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels); | ||||||
4236 | |||||||
4237 | unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; | ||||||
4238 | Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false, | ||||||
4239 | MaxFixpointIterations, OREGetter, DEBUG_TYPE); | ||||||
4240 | |||||||
4241 | OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); | ||||||
4242 | bool Changed = OMPOpt.run(true); | ||||||
4243 | if (Changed) | ||||||
4244 | return PreservedAnalyses::none(); | ||||||
4245 | |||||||
4246 | return PreservedAnalyses::all(); | ||||||
4247 | } | ||||||
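// Usage sketch (an assumption, not taken from this file): with the new pass
// manager this module pass is conventionally registered under the name
// "openmp-opt", so a standalone invocation would look roughly like
//   opt -passes=openmp-opt -S input.ll -o output.ll
// while the CGSCC variant below is typically reachable as "openmp-opt-cgscc".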
4248 | |||||||
4249 | PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C, | ||||||
4250 | CGSCCAnalysisManager &AM, | ||||||
4251 | LazyCallGraph &CG, | ||||||
4252 | CGSCCUpdateResult &UR) { | ||||||
4253 | if (!containsOpenMP(*C.begin()->getFunction().getParent())) | ||||||
4254 | return PreservedAnalyses::all(); | ||||||
4255 | if (DisableOpenMPOptimizations) | ||||||
4256 | return PreservedAnalyses::all(); | ||||||
4257 | |||||||
4258 | SmallVector<Function *, 16> SCC; | ||||||
4259 | // If there are kernels in the module, we have to run on all SCC's. | ||||||
4260 | for (LazyCallGraph::Node &N : C) { | ||||||
4261 | Function *Fn = &N.getFunction(); | ||||||
4262 | SCC.push_back(Fn); | ||||||
4263 | } | ||||||
4264 | |||||||
4265 | if (SCC.empty()) | ||||||
4266 | return PreservedAnalyses::all(); | ||||||
4267 | |||||||
4268 | Module &M = *C.begin()->getFunction().getParent(); | ||||||
4269 | |||||||
4270 | KernelSet Kernels = getDeviceKernels(M); | ||||||
4271 | |||||||
4272 | FunctionAnalysisManager &FAM = | ||||||
4273 | AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); | ||||||
4274 | |||||||
4275 | AnalysisGetter AG(FAM); | ||||||
4276 | |||||||
4277 | auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & { | ||||||
4278 | return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); | ||||||
4279 | }; | ||||||
4280 | |||||||
4281 | BumpPtrAllocator Allocator; | ||||||
4282 | CallGraphUpdater CGUpdater; | ||||||
4283 | CGUpdater.initialize(CG, C, AM, UR); | ||||||
4284 | |||||||
4285 | SetVector<Function *> Functions(SCC.begin(), SCC.end()); | ||||||
4286 | OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, | ||||||
4287 | /*CGSCC*/ Functions, Kernels); | ||||||
4288 | |||||||
4289 | unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; | ||||||
4290 | Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, | ||||||
4291 | MaxFixpointIterations, OREGetter, DEBUG_TYPE); | ||||||
4292 | |||||||
4293 | OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); | ||||||
4294 | bool Changed = OMPOpt.run(false); | ||||||
4295 | if (Changed) | ||||||
4296 | return PreservedAnalyses::none(); | ||||||
4297 | |||||||
4298 | return PreservedAnalyses::all(); | ||||||
4299 | } | ||||||
4300 | |||||||
4301 | namespace { | ||||||
4302 | |||||||
4303 | struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass { | ||||||
4304 | CallGraphUpdater CGUpdater; | ||||||
4305 | static char ID; | ||||||
4306 | |||||||
4307 | OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) { | ||||||
4308 | initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry()); | ||||||
4309 | } | ||||||
4310 | |||||||
4311 | void getAnalysisUsage(AnalysisUsage &AU) const override { | ||||||
4312 | CallGraphSCCPass::getAnalysisUsage(AU); | ||||||
4313 | } | ||||||
4314 | |||||||
4315 | bool runOnSCC(CallGraphSCC &CGSCC) override { | ||||||
4316 | if (!containsOpenMP(CGSCC.getCallGraph().getModule())) | ||||||
4317 | return false; | ||||||
4318 | if (DisableOpenMPOptimizations || skipSCC(CGSCC)) | ||||||
4319 | return false; | ||||||
4320 | |||||||
4321 | SmallVector<Function *, 16> SCC; | ||||||
4322 | // If there are kernels in the module, we have to run on all SCC's. | ||||||
4323 | for (CallGraphNode *CGN : CGSCC) { | ||||||
4324 | Function *Fn = CGN->getFunction(); | ||||||
4325 | if (!Fn || Fn->isDeclaration()) | ||||||
4326 | continue; | ||||||
4327 | SCC.push_back(Fn); | ||||||
4328 | } | ||||||
4329 | |||||||
4330 | if (SCC.empty()) | ||||||
4331 | return false; | ||||||
4332 | |||||||
4333 | Module &M = CGSCC.getCallGraph().getModule(); | ||||||
4334 | KernelSet Kernels = getDeviceKernels(M); | ||||||
4335 | |||||||
4336 | CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); | ||||||
4337 | CGUpdater.initialize(CG, CGSCC); | ||||||
4338 | |||||||
4339 | // Maintain a map of functions to avoid rebuilding the ORE | ||||||
4340 | DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap; | ||||||
4341 | auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & { | ||||||
4342 | std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F]; | ||||||
4343 | if (!ORE) | ||||||
4344 | ORE = std::make_unique<OptimizationRemarkEmitter>(F); | ||||||
4345 | return *ORE; | ||||||
4346 | }; | ||||||
4347 | |||||||
4348 | AnalysisGetter AG; | ||||||
4349 | SetVector<Function *> Functions(SCC.begin(), SCC.end()); | ||||||
4350 | BumpPtrAllocator Allocator; | ||||||
4351 | OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, | ||||||
4352 | Allocator, | ||||||
4353 | /*CGSCC*/ Functions, Kernels); | ||||||
4354 | |||||||
4355 | unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32; | ||||||
4356 | Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true, | ||||||
4357 | MaxFixpointIterations, OREGetter, DEBUG_TYPE); | ||||||
4358 | |||||||
4359 | OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A); | ||||||
4360 | return OMPOpt.run(false); | ||||||
4361 | } | ||||||
4362 | |||||||
4363 | bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); } | ||||||
4364 | }; | ||||||
4365 | |||||||
4366 | } // end anonymous namespace | ||||||
4367 | |||||||
4368 | KernelSet llvm::omp::getDeviceKernels(Module &M) { | ||||||
4369 | // TODO: Create a more cross-platform way of determining device kernels. | ||||||
4370 | NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); | ||||||
4371 | KernelSet Kernels; | ||||||
4372 | |||||||
4373 | if (!MD) | ||||||
4374 | return Kernels; | ||||||
4375 | |||||||
4376 | for (auto *Op : MD->operands()) { | ||||||
4377 | if (Op->getNumOperands() < 2) | ||||||
4378 | continue; | ||||||
4379 | MDString *KindID = dyn_cast<MDString>(Op->getOperand(1)); | ||||||
4380 | if (!KindID || KindID->getString() != "kernel") | ||||||
4381 | continue; | ||||||
4382 | |||||||
4383 | Function *KernelFn = | ||||||
4384 | mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)); | ||||||
4385 | if (!KernelFn) | ||||||
4386 | continue; | ||||||
4387 | |||||||
4388 | ++NumOpenMPTargetRegionKernels; | ||||||
4389 | |||||||
4390 | Kernels.insert(KernelFn); | ||||||
4391 | } | ||||||
4392 | |||||||
4393 | return Kernels; | ||||||
4394 | } | ||||||
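// Illustrative sketch (the exact metadata shape is an assumption): a device
// kernel picked up by getDeviceKernels() is usually announced in the module as
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @some_kernel, !"kernel", i32 1}
// where operand 0 is the kernel function and operand 1 is the "kernel" string
// matched above; @some_kernel is a hypothetical name.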
4395 | |||||||
4396 | bool llvm::omp::containsOpenMP(Module &M) { | ||||||
4397 | Metadata *MD = M.getModuleFlag("openmp"); | ||||||
4398 | if (!MD) | ||||||
4399 | return false; | ||||||
4400 | |||||||
4401 | return true; | ||||||
4402 | } | ||||||
4403 | |||||||
4404 | bool llvm::omp::isOpenMPDevice(Module &M) { | ||||||
4405 | Metadata *MD = M.getModuleFlag("openmp-device"); | ||||||
4406 | if (!MD) | ||||||
4407 | return false; | ||||||
4408 | |||||||
4409 | return true; | ||||||
4410 | } | ||||||
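// Illustrative sketch (the flag values are an assumption): frontends mark
// OpenMP modules with module flags along the lines of
//   !llvm.module.flags = !{!0, !1}
//   !0 = !{i32 7, !"openmp", i32 50}
//   !1 = !{i32 7, !"openmp-device", i32 50}
// so containsOpenMP() and isOpenMPDevice() reduce to simple flag lookups.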
4411 | |||||||
4412 | char OpenMPOptCGSCCLegacyPass::ID = 0; | ||||||
4413 | |||||||
4414 | INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc", | ||||||
4415 | "OpenMP specific optimizations", false, false) | ||||||
4416 | INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) | ||||||
4417 | INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc", | ||||||
4418 | "OpenMP specific optimizations", false, false) | ||||||
4419 | |||||||
4420 | Pass *llvm::createOpenMPOptCGSCCLegacyPass() { | ||||||
4421 | return new OpenMPOptCGSCCLegacyPass(); | ||||||
4422 | } |
1 | //===- llvm/ADT/SmallPtrSet.h - 'Normally small' pointer set ----*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the SmallPtrSet class. See the doxygen comment for |
10 | // SmallPtrSetImplBase for more details on the algorithm used. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef LLVM_ADT_SMALLPTRSET_H |
15 | #define LLVM_ADT_SMALLPTRSET_H |
16 | |
17 | #include "llvm/ADT/EpochTracker.h" |
18 | #include "llvm/Support/Compiler.h" |
19 | #include "llvm/Support/ReverseIteration.h" |
20 | #include "llvm/Support/type_traits.h" |
21 | #include <cassert> |
22 | #include <cstddef> |
23 | #include <cstdlib> |
24 | #include <cstring> |
25 | #include <initializer_list> |
26 | #include <iterator> |
27 | #include <utility> |
28 | |
29 | namespace llvm { |
30 | |
31 | /// SmallPtrSetImplBase - This is the common code shared among all the |
32 | /// SmallPtrSet<>'s, which is almost everything. SmallPtrSet has two modes, one |
33 | /// for small and one for large sets. |
34 | /// |
35 | /// Small sets use an array of pointers allocated in the SmallPtrSet object, |
36 | /// which is treated as a simple array of pointers. When a pointer is added to |
37 | /// the set, the array is scanned to see if the element already exists, if not |
38 | /// the element is 'pushed back' onto the array. If we run out of space in the |
39 | /// array, we grow into the 'large set' case. SmallSet should be used when the |
40 | /// sets are often small. In this case, no memory allocation is used, and only |
41 | /// light-weight and cache-efficient scanning is used. |
42 | /// |
43 | /// Large sets use a classic exponentially-probed hash table. Empty buckets are |
44 | /// represented with an illegal pointer value (-1) to allow null pointers to be |
45 | /// inserted. Tombstones are represented with another illegal pointer value |
46 | /// (-2), to allow deletion. The hash table is resized when the table is 3/4 or |
47 | /// more full. When this happens, the table is doubled in size.
48 | /// |
49 | class SmallPtrSetImplBase : public DebugEpochBase { |
50 | friend class SmallPtrSetIteratorImpl; |
51 | |
52 | protected: |
53 | /// SmallArray - Points to a fixed size set of buckets, used in 'small mode'. |
54 | const void **SmallArray; |
55 | /// CurArray - This is the current set of buckets. If equal to SmallArray, |
56 | /// then the set is in 'small mode'. |
57 | const void **CurArray; |
58 | /// CurArraySize - The allocated size of CurArray, always a power of two. |
59 | unsigned CurArraySize; |
60 | |
61 | /// Number of elements in CurArray that contain a value or are a tombstone. |
62 | /// If small, all these elements are at the beginning of CurArray and the rest |
63 | /// is uninitialized. |
64 | unsigned NumNonEmpty; |
65 | /// Number of tombstones in CurArray. |
66 | unsigned NumTombstones; |
67 | |
68 | // Helpers to copy and move construct a SmallPtrSet. |
69 | SmallPtrSetImplBase(const void **SmallStorage, |
70 | const SmallPtrSetImplBase &that); |
71 | SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize, |
72 | SmallPtrSetImplBase &&that); |
73 | |
74 | explicit SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize) |
75 | : SmallArray(SmallStorage), CurArray(SmallStorage), |
76 | CurArraySize(SmallSize), NumNonEmpty(0), NumTombstones(0) { |
77 | assert(SmallSize && (SmallSize & (SmallSize-1)) == 0 &&
78 | "Initial size must be a power of two!");
79 | } |
80 | |
81 | ~SmallPtrSetImplBase() { |
82 | if (!isSmall()) |
83 | free(CurArray); |
84 | } |
85 | |
86 | public: |
87 | using size_type = unsigned; |
88 | |
89 | SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete; |
90 | |
91 | LLVM_NODISCARD bool empty() const { return size() == 0; }
92 | size_type size() const { return NumNonEmpty - NumTombstones; } |
93 | |
94 | void clear() { |
95 | incrementEpoch(); |
96 | // If the capacity of the array is huge, and the # elements used is small, |
97 | // shrink the array. |
98 | if (!isSmall()) { |
99 | if (size() * 4 < CurArraySize && CurArraySize > 32) |
100 | return shrink_and_clear(); |
101 | // Fill the array with empty markers. |
102 | memset(CurArray, -1, CurArraySize * sizeof(void *)); |
103 | } |
104 | |
105 | NumNonEmpty = 0; |
106 | NumTombstones = 0; |
107 | } |
108 | |
109 | protected: |
110 | static void *getTombstoneMarker() { return reinterpret_cast<void*>(-2); } |
111 | |
112 | static void *getEmptyMarker() { |
113 | // Note that -1 is chosen to make clear() efficiently implementable with |
114 | // memset and because it's not a valid pointer value. |
115 | return reinterpret_cast<void*>(-1); |
116 | } |
117 | |
118 | const void **EndPointer() const { |
119 | return isSmall() ? CurArray + NumNonEmpty : CurArray + CurArraySize; |
120 | } |
121 | |
122 | /// insert_imp - This returns true if the pointer was new to the set, false if |
123 | /// it was already in the set. This is hidden from the client so that the |
124 | /// derived class can check that the right type of pointer is passed in. |
125 | std::pair<const void *const *, bool> insert_imp(const void *Ptr) { |
126 | if (isSmall()) { |
127 | // Check to see if it is already in the set. |
128 | const void **LastTombstone = nullptr; |
129 | for (const void **APtr = SmallArray, **E = SmallArray + NumNonEmpty; |
130 | APtr != E; ++APtr) { |
131 | const void *Value = *APtr; |
132 | if (Value == Ptr) |
133 | return std::make_pair(APtr, false); |
134 | if (Value == getTombstoneMarker()) |
135 | LastTombstone = APtr; |
136 | } |
137 | |
138 | // Did we find any tombstone marker? |
139 | if (LastTombstone != nullptr) { |
140 | *LastTombstone = Ptr; |
141 | --NumTombstones; |
142 | incrementEpoch(); |
143 | return std::make_pair(LastTombstone, true); |
144 | } |
145 | |
146 | // Nope, there isn't. If we stay small, just 'pushback' now. |
147 | if (NumNonEmpty < CurArraySize) { |
148 | SmallArray[NumNonEmpty++] = Ptr; |
149 | incrementEpoch(); |
150 | return std::make_pair(SmallArray + (NumNonEmpty - 1), true); |
151 | } |
152 | // Otherwise, hit the big set case, which will call grow. |
153 | } |
154 | return insert_imp_big(Ptr); |
155 | } |
156 | |
157 | /// erase_imp - If the set contains the specified pointer, remove it and |
158 | /// return true, otherwise return false. This is hidden from the client so |
159 | /// that the derived class can check that the right type of pointer is passed |
160 | /// in. |
161 | bool erase_imp(const void * Ptr) { |
162 | const void *const *P = find_imp(Ptr); |
163 | if (P == EndPointer()) |
164 | return false; |
165 | |
166 | const void **Loc = const_cast<const void **>(P); |
167 | assert(*Loc == Ptr && "broken find!");
168 | *Loc = getTombstoneMarker(); |
169 | NumTombstones++; |
170 | return true; |
171 | } |
172 | |
173 | /// Returns the raw pointer needed to construct an iterator. If element not |
174 | /// found, this will be EndPointer. Otherwise, it will be a pointer to the |
175 | /// slot which stores Ptr.
176 | const void *const * find_imp(const void * Ptr) const { |
177 | if (isSmall()) { |
178 | // Linear search for the item. |
179 | for (const void *const *APtr = SmallArray, |
180 | *const *E = SmallArray + NumNonEmpty; APtr != E; ++APtr) |
181 | if (*APtr == Ptr) |
182 | return APtr; |
183 | return EndPointer(); |
184 | } |
185 | |
186 | // Big set case. |
187 | auto *Bucket = FindBucketFor(Ptr); |
188 | if (*Bucket == Ptr) |
189 | return Bucket; |
190 | return EndPointer(); |
191 | } |
192 | |
193 | private: |
194 | bool isSmall() const { return CurArray == SmallArray; } |
195 | |
196 | std::pair<const void *const *, bool> insert_imp_big(const void *Ptr); |
197 | |
198 | const void * const *FindBucketFor(const void *Ptr) const; |
199 | void shrink_and_clear(); |
200 | |
201 | /// Grow - Allocate a larger backing store for the buckets and move it over. |
202 | void Grow(unsigned NewSize); |
203 | |
204 | protected: |
205 | /// swap - Swaps the elements of two sets. |
206 | /// Note: This method assumes that both sets have the same small size. |
207 | void swap(SmallPtrSetImplBase &RHS); |
208 | |
209 | void CopyFrom(const SmallPtrSetImplBase &RHS); |
210 | void MoveFrom(unsigned SmallSize, SmallPtrSetImplBase &&RHS); |
211 | |
212 | private: |
213 | /// Code shared by MoveFrom() and move constructor. |
214 | void MoveHelper(unsigned SmallSize, SmallPtrSetImplBase &&RHS); |
215 | /// Code shared by CopyFrom() and copy constructor. |
216 | void CopyHelper(const SmallPtrSetImplBase &RHS); |
217 | }; |
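// A minimal usage sketch of the public SmallPtrSet API layered on this base
// (the variable names are hypothetical, not part of this header):
//   llvm::SmallPtrSet<llvm::Value *, 8> Visited;   // 8 inline slots, no heap
//   bool Inserted = Visited.insert(V).second;      // false if V was present
//   if (Visited.count(V)) { /* already seen */ }
// Once more distinct pointers are inserted than fit in the inline buffer, the
// set grows into the hash-table ("large") representation described above.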
218 | |
219 | /// SmallPtrSetIteratorImpl - This is the common base class shared between all |
220 | /// instances of SmallPtrSetIterator. |
221 | class SmallPtrSetIteratorImpl { |
222 | protected: |
223 | const void *const *Bucket; |
224 | const void *const *End; |
225 | |
226 | public: |
227 | explicit SmallPtrSetIteratorImpl(const void *const *BP, const void*const *E) |
228 | : Bucket(BP), End(E) { |
229 | if (shouldReverseIterate()) { |
230 | RetreatIfNotValid(); |
231 | return; |
232 | } |
233 | AdvanceIfNotValid(); |
234 | } |
235 | |
236 | bool operator==(const SmallPtrSetIteratorImpl &RHS) const { |
237 | return Bucket == RHS.Bucket; |
238 | } |
239 | bool operator!=(const SmallPtrSetIteratorImpl &RHS) const { |
240 | return Bucket != RHS.Bucket; |
241 | } |
242 | |
243 | protected: |
244 | /// AdvanceIfNotValid - If the current bucket isn't valid, advance to a bucket |
245 | /// that is. This is guaranteed to stop because the end() bucket is marked |
246 | /// valid. |
247 | void AdvanceIfNotValid() { |
248 | assert(Bucket <= End);
249 | while (Bucket != End && |
250 | (*Bucket == SmallPtrSetImplBase::getEmptyMarker() || |
251 | *Bucket == SmallPtrSetImplBase::getTombstoneMarker())) |
252 | ++Bucket; |
253 | } |
254 | void RetreatIfNotValid() { |
255 | assert(Bucket >= End);
256 | while (Bucket != End && |
257 | (Bucket[-1] == SmallPtrSetImplBase::getEmptyMarker() || |
258 | Bucket[-1] == SmallPtrSetImplBase::getTombstoneMarker())) { |
259 | --Bucket; |
260 | } |
261 | } |
262 | }; |
263 | |
264 | /// SmallPtrSetIterator - This implements a const_iterator for SmallPtrSet. |
265 | template <typename PtrTy> |
266 | class SmallPtrSetIterator : public SmallPtrSetIteratorImpl, |
267 | DebugEpochBase::HandleBase { |
268 | using PtrTraits = PointerLikeTypeTraits<PtrTy>; |
269 | |
270 | public: |
271 | using value_type = PtrTy; |
272 | using reference = PtrTy; |
273 | using pointer = PtrTy; |
274 | using difference_type = std::ptrdiff_t; |
275 | using iterator_category = std::forward_iterator_tag; |
276 | |
277 | explicit SmallPtrSetIterator(const void *const *BP, const void *const *E, |
278 | const DebugEpochBase &Epoch) |
279 | : SmallPtrSetIteratorImpl(BP, E), DebugEpochBase::HandleBase(&Epoch) {} |
280 | |
281 | // Most methods are provided by the base class. |
282 | |
283 | const PtrTy operator*() const { |
284 | assert(isHandleInSync() && "invalid iterator access!");
285 | if (shouldReverseIterate()) { |
286 | assert(Bucket > End);
287 | return PtrTraits::getFromVoidPointer(const_cast<void *>(Bucket[-1])); |
288 | } |
289 | assert(Bucket < End);
290 | return PtrTraits::getFromVoidPointer(const_cast<void*>(*Bucket)); |
291 | } |
292 | |
293 | inline SmallPtrSetIterator& operator++() { // Preincrement |
294 | assert(isHandleInSync() && "invalid iterator access!");
295 | if (shouldReverseIterate()) { |
296 | --Bucket; |
297 | RetreatIfNotValid(); |
298 | return *this; |
299 | } |
300 | ++Bucket; |
301 | AdvanceIfNotValid(); |
302 | return *this; |
303 | } |
304 | |
305 | SmallPtrSetIterator operator++(int) { // Postincrement |
306 | SmallPtrSetIterator tmp = *this; |
307 | ++*this; |
308 | return tmp; |
309 | } |
310 | }; |
311 | |
312 | /// RoundUpToPowerOfTwo - This is a helper template that rounds N up to the next |
313 | /// power of two (which means N itself if N is already a power of two). |
314 | template<unsigned N> |
315 | struct RoundUpToPowerOfTwo; |
316 | |
317 | /// RoundUpToPowerOfTwoH - If N is not a power of two, increase it. This is a |
318 | /// helper template used to implement RoundUpToPowerOfTwo. |
319 | template<unsigned N, bool isPowerTwo> |
320 | struct RoundUpToPowerOfTwoH { |
321 | enum { Val = N }; |
322 | }; |
323 | template<unsigned N> |
324 | struct RoundUpToPowerOfTwoH<N, false> { |
325 | enum { |
326 | // We could just use NextVal = N+1, but this converges faster. N|(N-1) sets |
327 | // the right-most zero bits to one all at once, e.g. 0b0011000 -> 0b0011111. |
328 | Val = RoundUpToPowerOfTwo<(N|(N-1)) + 1>::Val |
329 | }; |
330 | }; |
331 | |
332 | template<unsigned N> |
333 | struct RoundUpToPowerOfTwo { |
334 | enum { Val = RoundUpToPowerOfTwoH<N, (N&(N-1)) == 0>::Val }; |
335 | }; |
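As a concrete check of the recursion above, the following compile-time assertions should hold (a small illustrative snippet, assuming this header is included as llvm/ADT/SmallPtrSet.h):

#include "llvm/ADT/SmallPtrSet.h"

// Already a power of two: returned unchanged.
static_assert(llvm::RoundUpToPowerOfTwo<8>::Val == 8, "kept as-is");
// 24 = 0b11000: 24|(24-1) = 0b11111, +1 = 32, which is a power of two.
static_assert(llvm::RoundUpToPowerOfTwo<24>::Val == 32, "rounded up");
// 5 needs more than one recursion step but still converges to 8.
static_assert(llvm::RoundUpToPowerOfTwo<5>::Val == 8, "rounded up");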
336 | |
337 | /// A templated base class for \c SmallPtrSet which provides the |
338 | /// typesafe interface that is common across all small sizes. |
339 | /// |
340 | /// This is particularly useful for passing around between interface boundaries |
341 | /// to avoid encoding a particular small size in the interface boundary. |
342 | template <typename PtrType> |
343 | class SmallPtrSetImpl : public SmallPtrSetImplBase { |
344 | using ConstPtrType = typename add_const_past_pointer<PtrType>::type; |
345 | using PtrTraits = PointerLikeTypeTraits<PtrType>; |
346 | using ConstPtrTraits = PointerLikeTypeTraits<ConstPtrType>; |
347 | |
348 | protected: |
349 | // Forward constructors to the base. |
350 | using SmallPtrSetImplBase::SmallPtrSetImplBase; |
351 | |
352 | public: |
353 | using iterator = SmallPtrSetIterator<PtrType>; |
354 | using const_iterator = SmallPtrSetIterator<PtrType>; |
355 | using key_type = ConstPtrType; |
356 | using value_type = PtrType; |
357 | |
358 | SmallPtrSetImpl(const SmallPtrSetImpl &) = delete; |
359 | |
360 | /// Inserts Ptr if and only if there is no element in the container equal to |
361 | /// Ptr. The bool component of the returned pair is true if and only if the |
362 | /// insertion takes place, and the iterator component of the pair points to |
363 | /// the element equal to Ptr. |
364 | std::pair<iterator, bool> insert(PtrType Ptr) { |
365 | auto p = insert_imp(PtrTraits::getAsVoidPointer(Ptr)); |
366 | return std::make_pair(makeIterator(p.first), p.second); |
367 | } |
368 | |
369 | /// Insert the given pointer with an iterator hint that is ignored. This is |
370 | /// identical to calling insert(Ptr), but allows SmallPtrSet to be used by |
371 | /// std::insert_iterator and std::inserter(). |
372 | iterator insert(iterator, PtrType Ptr) { |
373 | return insert(Ptr).first; |
374 | } |
375 | |
376 | /// erase - If the set contains the specified pointer, remove it and return |
377 | /// true, otherwise return false. |
378 | bool erase(PtrType Ptr) { |
379 | return erase_imp(PtrTraits::getAsVoidPointer(Ptr)); |
380 | } |
381 | /// count - Return 1 if the specified pointer is in the set, 0 otherwise. |
382 | size_type count(ConstPtrType Ptr) const { |
383 | return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer(); |
384 | } |
385 | iterator find(ConstPtrType Ptr) const { |
386 | return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr))); |
387 | } |
388 | bool contains(ConstPtrType Ptr) const { |
389 | return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer(); |
390 | } |
391 | |
392 | template <typename IterT> |
393 | void insert(IterT I, IterT E) { |
394 | for (; I != E; ++I) |
395 | insert(*I); |
396 | } |
397 | |
398 | void insert(std::initializer_list<PtrType> IL) { |
399 | insert(IL.begin(), IL.end()); |
400 | } |
401 | |
402 | iterator begin() const { |
403 | if (shouldReverseIterate()) |
404 | return makeIterator(EndPointer() - 1); |
405 | return makeIterator(CurArray); |
406 | } |
407 | iterator end() const { return makeIterator(EndPointer()); } |
408 | |
409 | private: |
410 | /// Create an iterator that dereferences to same place as the given pointer. |
411 | iterator makeIterator(const void *const *P) const { |
412 | if (shouldReverseIterate()) |
413 | return iterator(P == EndPointer() ? CurArray : P + 1, CurArray, *this); |
414 | return iterator(P, EndPointer(), *this); |
415 | } |
416 | }; |
417 | |
418 | /// Equality comparison for SmallPtrSet. |
419 | /// |
420 | /// Iterates over elements of LHS confirming that each value from LHS is also in |
421 | /// RHS, and that no additional values are in RHS. |
422 | template <typename PtrType> |
423 | bool operator==(const SmallPtrSetImpl<PtrType> &LHS, |
424 | const SmallPtrSetImpl<PtrType> &RHS) { |
425 | if (LHS.size() != RHS.size()) |
426 | return false; |
427 | |
428 | for (const auto *KV : LHS) |
429 | if (!RHS.count(KV)) |
430 | return false; |
431 | |
432 | return true; |
433 | } |
434 | |
435 | /// Inequality comparison for SmallPtrSet. |
436 | /// |
437 | /// Equivalent to !(LHS == RHS). |
438 | template <typename PtrType> |
439 | bool operator!=(const SmallPtrSetImpl<PtrType> &LHS, |
440 | const SmallPtrSetImpl<PtrType> &RHS) { |
441 | return !(LHS == RHS); |
442 | } |
443 | |
444 | /// SmallPtrSet - This class implements a set which is optimized for holding |
445 | /// SmallSize or less elements. This internally rounds up SmallSize to the next |
446 | /// power of two if it is not already a power of two. See the comments above |
447 | /// SmallPtrSetImplBase for details of the algorithm. |
448 | template<class PtrType, unsigned SmallSize> |
449 | class SmallPtrSet : public SmallPtrSetImpl<PtrType> { |
450 | // In small mode SmallPtrSet uses linear search for the elements, so it is |
451 | // not a good idea to choose this value too high. You may consider using a |
452 | // DenseSet<> instead if you expect many elements in the set. |
453 | static_assert(SmallSize <= 32, "SmallSize should be small"); |
454 | |
455 | using BaseT = SmallPtrSetImpl<PtrType>; |
456 | |
457 | // Make sure that SmallSize is a power of two, round up if not. |
458 | enum { SmallSizePowTwo = RoundUpToPowerOfTwo<SmallSize>::Val }; |
459 | /// SmallStorage - Fixed size storage used in 'small mode'. |
460 | const void *SmallStorage[SmallSizePowTwo]; |
461 | |
462 | public: |
463 | SmallPtrSet() : BaseT(SmallStorage, SmallSizePowTwo) {} |
464 | SmallPtrSet(const SmallPtrSet &that) : BaseT(SmallStorage, that) {} |
465 | SmallPtrSet(SmallPtrSet &&that) |
466 | : BaseT(SmallStorage, SmallSizePowTwo, std::move(that)) {} |
467 | |
468 | template<typename It> |
469 | SmallPtrSet(It I, It E) : BaseT(SmallStorage, SmallSizePowTwo) { |
470 | this->insert(I, E); |
471 | } |
472 | |
473 | SmallPtrSet(std::initializer_list<PtrType> IL) |
474 | : BaseT(SmallStorage, SmallSizePowTwo) { |
475 | this->insert(IL.begin(), IL.end()); |
476 | } |
477 | |
478 | SmallPtrSet<PtrType, SmallSize> & |
479 | operator=(const SmallPtrSet<PtrType, SmallSize> &RHS) { |
480 | if (&RHS != this) |
481 | this->CopyFrom(RHS); |
482 | return *this; |
483 | } |
484 | |
485 | SmallPtrSet<PtrType, SmallSize> & |
486 | operator=(SmallPtrSet<PtrType, SmallSize> &&RHS) { |
487 | if (&RHS != this) |
488 | this->MoveFrom(SmallSizePowTwo, std::move(RHS)); |
489 | return *this; |
490 | } |
491 | |
492 | SmallPtrSet<PtrType, SmallSize> & |
493 | operator=(std::initializer_list<PtrType> IL) { |
494 | this->clear(); |
495 | this->insert(IL.begin(), IL.end()); |
496 | return *this; |
497 | } |
498 | |
499 | /// swap - Swaps the elements of two sets. |
500 | void swap(SmallPtrSet<PtrType, SmallSize> &RHS) { |
501 | SmallPtrSetImplBase::swap(RHS); |
502 | } |
503 | }; |
504 | |
505 | } // end namespace llvm |
506 | |
507 | namespace std { |
508 | |
509 | /// Implement std::swap in terms of SmallPtrSet swap. |
510 | template<class T, unsigned N> |
511 | inline void swap(llvm::SmallPtrSet<T, N> &LHS, llvm::SmallPtrSet<T, N> &RHS) { |
512 | LHS.swap(RHS); |
513 | } |
514 | |
515 | } // end namespace std |
516 | |
517 | #endif // LLVM_ADT_SMALLPTRSET_H |
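Putting the pieces of this header together, a short usage sketch (hedged; the function name and element pointers are illustrative, the member functions are the ones declared above):

#include "llvm/ADT/SmallPtrSet.h"

void smallPtrSetExample(int *A, int *B) {
  // Holds up to 4 pointers inline ("small mode"), then reallocates.
  llvm::SmallPtrSet<int *, 4> Set;

  auto Res = Set.insert(A);   // Res.second == true: newly inserted
  auto Dup = Set.insert(A);   // Dup.second == false: already present
  (void)Res; (void)Dup;

  bool Erased = Set.erase(B); // false: B was never inserted
  (void)Erased;

  if (Set.contains(A)) {      // equivalent to Set.count(A) == 1
    for (int *P : Set)        // iteration skips empty/tombstoned slots
      (void)P;
  }
}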
1 | //===- Attributor.h --- Module-wide attribute deduction ---------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Attributor: An inter procedural (abstract) "attribute" deduction framework. |
10 | // |
11 | // The Attributor framework is an inter procedural abstract analysis (fixpoint |
12 | // iteration analysis). The goal is to allow easy deduction of new attributes as |
13 | // well as information exchange between abstract attributes in-flight. |
14 | // |
15 | // The Attributor class is the driver and the link between the various abstract |
16 | // attributes. The Attributor will iterate until a fixpoint state is reached by |
17 | // all abstract attributes in-flight, or until it will enforce a pessimistic fix |
18 | // point because an iteration limit is reached. |
19 | // |
20 | // Abstract attributes, derived from the AbstractAttribute class, actually |
21 | // describe properties of the code. They can correspond to actual LLVM-IR |
22 | // attributes, or they can be more general, ultimately unrelated to LLVM-IR |
23 | // attributes. The latter is useful when an abstract attribute provides
24 | // information to other abstract attributes in-flight but we might not want to |
25 | // manifest the information. The Attributor allows to query in-flight abstract |
26 | // attributes through the `Attributor::getAAFor` method (see the method |
27 | // description for an example). If the method is used by an abstract attribute |
28 | // P, and it results in an abstract attribute Q, the Attributor will |
29 | // automatically capture a potential dependence from Q to P. This dependence |
30 | // will cause P to be reevaluated whenever Q changes in the future. |
31 | // |
32 | // The Attributor will only reevaluate abstract attributes that might have |
33 | // changed since the last iteration. That means that the Attributor will not
34 | // revisit all instructions/blocks/functions in the module but only query |
35 | // an update from a subset of the abstract attributes. |
36 | // |
37 | // The update method `AbstractAttribute::updateImpl` is implemented by the |
38 | // specific "abstract attribute" subclasses. The method is invoked whenever the |
39 | // currently assumed state (see the AbstractState class) might not be valid |
40 | // anymore. This can, for example, happen if the state was dependent on another |
41 | // abstract attribute that changed. In every invocation, the update method has |
42 | // to adjust the internal state of an abstract attribute to a point that is |
43 | // justifiable by the underlying IR and the current state of abstract attributes |
44 | // in-flight. Since the IR is given and assumed to be valid, the information |
45 | // derived from it can be assumed to hold. However, information derived from |
46 | // other abstract attributes is conditional on various things. If the justifying |
47 | // state changed, the `updateImpl` has to revisit the situation and potentially |
48 | // find another justification or limit the optimistic assumes made. |
49 | // |
50 | // Change is the key in this framework. Until a state of no-change, thus a |
51 | // fixpoint, is reached, the Attributor will query the abstract attributes |
52 | // in-flight to re-evaluate their state. If the (current) state is too |
53 | // optimistic, hence it cannot be justified anymore through other abstract |
54 | // attributes or the state of the IR, the state of the abstract attribute will |
55 | // have to change. Generally, we assume abstract attribute state to be a finite |
56 | // height lattice and the update function to be monotone. However, these |
57 | // conditions are not enforced because the iteration limit will guarantee |
58 | // termination. If an optimistic fixpoint is reached, or a pessimistic fix |
59 | // point is enforced after a timeout, the abstract attributes are tasked to |
60 | // manifest their result in the IR for passes to come. |
61 | // |
62 | // Attribute manifestation is not mandatory. If desired, there is support to |
63 | // generate a single or multiple LLVM-IR attributes already in the helper struct |
64 | // IRAttribute. In the simplest case, a subclass inherits from IRAttribute with |
65 | // a proper Attribute::AttrKind as template parameter. The Attributor |
66 | // manifestation framework will then create and place a new attribute if it is |
67 | // allowed to do so (based on the abstract state). Other use cases can be |
68 | // achieved by overloading AbstractAttribute or IRAttribute methods. |
69 | // |
70 | // |
71 | // The "mechanics" of adding a new "abstract attribute": |
72 | // - Define a class (transitively) inheriting from AbstractAttribute and one |
73 | // (which could be the same) that (transitively) inherits from AbstractState. |
74 | // For the latter, consider the already available BooleanState and |
75 | // {Inc,Dec,Bit}IntegerState if they fit your needs, e.g., you require only a |
76 | // number tracking or bit-encoding. |
77 | // - Implement all pure methods. Also use overloading if the attribute is not |
78 | // conforming with the "default" behavior: A (set of) LLVM-IR attribute(s) for |
79 | // an argument, call site argument, function return value, or function. See |
80 | // the class and method descriptions for more information on the two |
81 | // "Abstract" classes and their respective methods. |
82 | // - Register opportunities for the new abstract attribute in the |
83 | // `Attributor::identifyDefaultAbstractAttributes` method if it should be |
84 | // counted as a 'default' attribute. |
85 | // - Add sufficient tests. |
86 | // - Add a Statistics object for bookkeeping. If it is a simple (set of) |
87 | // attribute(s) manifested through the Attributor manifestation framework, see |
88 | // the bookkeeping function in Attributor.cpp. |
89 | // - If instructions with a certain opcode are interesting to the attribute, add |
90 | // that opcode to the switch in `Attributor::identifyAbstractAttributes`. This |
91 | // will make it possible to query all those instructions through the |
92 | // `InformationCache::getOpcodeInstMapForFunction` interface and eliminate the |
93 | // need to traverse the IR repeatedly. |
94 | // |
95 | //===----------------------------------------------------------------------===// |
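To make the fixpoint iteration described above concrete, here is a deliberately simplified, framework-independent sketch of the driver loop; all names are illustrative and this is not the Attributor API:

#include <utility>
#include <vector>

// Illustrative only: a worklist-driven fixpoint loop in the spirit of the
// description above. Each abstract attribute updates its assumed state and
// reports whether it changed; its dependents are then re-queued.
struct ToyAbstractAttribute {
  std::vector<ToyAbstractAttribute *> Dependents; // revisit these on change
  virtual ~ToyAbstractAttribute() = default;
  virtual bool update() = 0; // returns true if the assumed state changed
};

inline void runToyFixpoint(std::vector<ToyAbstractAttribute *> Worklist,
                           unsigned MaxIterations) {
  for (unsigned It = 0; It < MaxIterations && !Worklist.empty(); ++It) {
    std::vector<ToyAbstractAttribute *> Next;
    for (ToyAbstractAttribute *AA : Worklist)
      if (AA->update()) // state had to become less optimistic
        Next.insert(Next.end(), AA->Dependents.begin(), AA->Dependents.end());
    Worklist = std::move(Next);
  }
  // If Worklist is still non-empty here, the real framework would enforce a
  // pessimistic fixpoint before manifesting results.
}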
96 | |
97 | #ifndef LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H |
98 | #define LLVM_TRANSFORMS_IPO_ATTRIBUTOR_H |
99 | |
100 | #include "llvm/ADT/DenseSet.h" |
101 | #include "llvm/ADT/GraphTraits.h" |
102 | #include "llvm/ADT/MapVector.h" |
103 | #include "llvm/ADT/STLExtras.h" |
104 | #include "llvm/ADT/SetVector.h" |
105 | #include "llvm/ADT/Triple.h" |
106 | #include "llvm/ADT/iterator.h" |
107 | #include "llvm/Analysis/AssumeBundleQueries.h" |
108 | #include "llvm/Analysis/CFG.h" |
109 | #include "llvm/Analysis/CGSCCPassManager.h" |
110 | #include "llvm/Analysis/LazyCallGraph.h" |
111 | #include "llvm/Analysis/LoopInfo.h" |
112 | #include "llvm/Analysis/MustExecute.h" |
113 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
114 | #include "llvm/Analysis/PostDominators.h" |
115 | #include "llvm/Analysis/TargetLibraryInfo.h" |
116 | #include "llvm/IR/AbstractCallSite.h" |
117 | #include "llvm/IR/ConstantRange.h" |
118 | #include "llvm/IR/PassManager.h" |
119 | #include "llvm/Support/Allocator.h" |
120 | #include "llvm/Support/Casting.h" |
121 | #include "llvm/Support/GraphWriter.h" |
122 | #include "llvm/Support/TimeProfiler.h" |
123 | #include "llvm/Transforms/Utils/CallGraphUpdater.h" |
124 | |
125 | namespace llvm { |
126 | |
127 | struct AADepGraphNode; |
128 | struct AADepGraph; |
129 | struct Attributor; |
130 | struct AbstractAttribute; |
131 | struct InformationCache; |
132 | struct AAIsDead; |
133 | struct AttributorCallGraph; |
134 | |
135 | class AAManager; |
136 | class AAResults; |
137 | class Function; |
138 | |
139 | /// Abstract Attribute helper functions. |
140 | namespace AA { |
141 | |
142 | /// Return true if \p V is dynamically unique, that is, there are no two |
143 | /// "instances" of \p V at runtime with different values. |
144 | bool isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA, |
145 | const Value &V); |
146 | |
147 | /// Return true if \p V is a valid value in \p Scope, that is a constant or an |
148 | /// instruction/argument of \p Scope. |
149 | bool isValidInScope(const Value &V, const Function *Scope); |
150 | |
151 | /// Return true if \p V is a valid value at position \p CtxI, that is a |
152 | /// constant, an argument of the same function as \p CtxI, or an instruction in |
153 | /// that function that dominates \p CtxI. |
154 | bool isValidAtPosition(const Value &V, const Instruction &CtxI, |
155 | InformationCache &InfoCache); |
156 | |
157 | /// Try to convert \p V to type \p Ty without introducing new instructions. If |
158 | /// this is not possible return `nullptr`. Note: this function basically knows |
159 | /// how to cast various constants. |
160 | Value *getWithType(Value &V, Type &Ty); |
161 | |
162 | /// Return the combination of \p A and \p B such that the result is a possible |
163 | /// value of both. \p B is potentially casted to match the type \p Ty or the |
164 | /// type of \p A if \p Ty is null. |
165 | /// |
166 | /// Examples: |
167 | /// X + none => X |
168 | /// not_none + undef => not_none |
169 | /// V1 + V2 => nullptr |
170 | Optional<Value *> |
171 | combineOptionalValuesInAAValueLatice(const Optional<Value *> &A, |
172 | const Optional<Value *> &B, Type *Ty); |
173 | |
174 | /// Return the initial value of \p Obj with type \p Ty if that is a constant. |
175 | Constant *getInitialValueForObj(Value &Obj, Type &Ty); |
176 | |
177 | /// Collect all potential underlying objects of \p Ptr at position \p CtxI in |
178 | /// \p Objects. Assumed information is used and dependences onto \p QueryingAA |
179 | /// are added appropriately. |
180 | /// |
181 | /// \returns True if \p Objects contains all assumed underlying objects, and |
182 | /// false if something went wrong and the objects could not be |
183 | /// determined. |
184 | bool getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr, |
185 | SmallVectorImpl<Value *> &Objects, |
186 | const AbstractAttribute &QueryingAA, |
187 | const Instruction *CtxI); |
188 | |
189 | /// Collect all potential values of the one stored by \p SI into |
190 | /// \p PotentialCopies. That is, the only copies that were made via the |
191 | /// store are assumed to be known and all in \p PotentialCopies. Dependences |
192 | /// onto \p QueryingAA are properly tracked, \p UsedAssumedInformation will |
193 | /// inform the caller if assumed information was used. |
194 | /// |
195 | /// \returns True if the assumed potential copies are all in \p PotentialCopies, |
196 | /// false if something went wrong and the copies could not be |
197 | /// determined. |
198 | bool getPotentialCopiesOfStoredValue( |
199 | Attributor &A, StoreInst &SI, SmallSetVector<Value *, 4> &PotentialCopies, |
200 | const AbstractAttribute &QueryingAA, bool &UsedAssumedInformation); |
201 | |
202 | } // namespace AA |
203 | |
204 | /// The value passed to the line option that defines the maximal initialization |
205 | /// chain length. |
206 | extern unsigned MaxInitializationChainLength; |
207 | |
208 | ///{ |
209 | enum class ChangeStatus { |
210 | CHANGED, |
211 | UNCHANGED, |
212 | }; |
213 | |
214 | ChangeStatus operator|(ChangeStatus l, ChangeStatus r); |
215 | ChangeStatus &operator|=(ChangeStatus &l, ChangeStatus r); |
216 | ChangeStatus operator&(ChangeStatus l, ChangeStatus r); |
217 | ChangeStatus &operator&=(ChangeStatus &l, ChangeStatus r); |
218 | |
219 | enum class DepClassTy { |
220 | REQUIRED, ///< The target cannot be valid if the source is not. |
221 | OPTIONAL, ///< The target may be valid if the source is not. |
222 | NONE, ///< Do not track a dependence between source and target. |
223 | }; |
224 | ///} |
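The operators declared above are commonly used to accumulate a result across several steps. A hedged usage sketch; manifestX/manifestY are hypothetical helpers, and the assumed semantics is that CHANGED dominates for `|`:

#include "llvm/Transforms/IPO/Attributor.h"

// Hypothetical steps, declared only for illustration.
llvm::ChangeStatus manifestX();
llvm::ChangeStatus manifestY();

llvm::ChangeStatus manifestAll() {
  llvm::ChangeStatus Changed = llvm::ChangeStatus::UNCHANGED;
  Changed |= manifestX(); // assumed: CHANGED dominates for operator|=
  Changed |= manifestY();
  return Changed;         // UNCHANGED only if every step was UNCHANGED
}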
225 | |
226 | /// The data structure for the nodes of a dependency graph |
227 | struct AADepGraphNode { |
228 | public: |
229 | virtual ~AADepGraphNode(){}; |
230 | using DepTy = PointerIntPair<AADepGraphNode *, 1>; |
231 | |
232 | protected: |
233 | /// Set of dependency graph nodes which should be updated if this one |
234 | /// is updated. The bit encodes if it is optional. |
235 | TinyPtrVector<DepTy> Deps; |
236 | |
237 | static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } |
238 | static AbstractAttribute *DepGetValAA(DepTy &DT) { |
239 | return cast<AbstractAttribute>(DT.getPointer()); |
240 | } |
241 | |
242 | operator AbstractAttribute *() { return cast<AbstractAttribute>(this); } |
243 | |
244 | public: |
245 | using iterator = |
246 | mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>; |
247 | using aaiterator = |
248 | mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetValAA)>; |
249 | |
250 | aaiterator begin() { return aaiterator(Deps.begin(), &DepGetValAA); } |
251 | aaiterator end() { return aaiterator(Deps.end(), &DepGetValAA); } |
252 | iterator child_begin() { return iterator(Deps.begin(), &DepGetVal); } |
253 | iterator child_end() { return iterator(Deps.end(), &DepGetVal); } |
254 | |
255 | virtual void print(raw_ostream &OS) const { OS << "AADepNode Impl\n"; } |
256 | TinyPtrVector<DepTy> &getDeps() { return Deps; } |
257 | |
258 | friend struct Attributor; |
259 | friend struct AADepGraph; |
260 | }; |
261 | |
262 | /// The data structure for the dependency graph |
263 | /// |
264 | /// Note that in this graph if there is an edge from A to B (A -> B), |
265 | /// then it means that B depends on A, and when the state of A is |
266 | /// updated, node B should also be updated |
267 | struct AADepGraph { |
268 | AADepGraph() {} |
269 | ~AADepGraph() {} |
270 | |
271 | using DepTy = AADepGraphNode::DepTy; |
272 | static AADepGraphNode *DepGetVal(DepTy &DT) { return DT.getPointer(); } |
273 | using iterator = |
274 | mapped_iterator<TinyPtrVector<DepTy>::iterator, decltype(&DepGetVal)>; |
275 | |
276 | /// There is no root node for the dependency graph. But the SCCIterator |
277 | /// requires a single entry point, so we maintain a fake("synthetic") root |
278 | /// node that depends on every node. |
279 | AADepGraphNode SyntheticRoot; |
280 | AADepGraphNode *GetEntryNode() { return &SyntheticRoot; } |
281 | |
282 | iterator begin() { return SyntheticRoot.child_begin(); } |
283 | iterator end() { return SyntheticRoot.child_end(); } |
284 | |
285 | void viewGraph(); |
286 | |
287 | /// Dump graph to file |
288 | void dumpGraph(); |
289 | |
290 | /// Print dependency graph |
291 | void print(); |
292 | }; |
293 | |
294 | /// Helper to describe and deal with positions in the LLVM-IR. |
295 | /// |
296 | /// A position in the IR is described by an anchor value and an "offset" that |
297 | /// could be the argument number, for call sites and arguments, or an indicator |
298 | /// of the "position kind". The kinds, specified in the Kind enum below, include |
299 | /// the locations in the attribute list, i.a., function scope and return value, |
300 | /// as well as a distinction between call sites and functions. Finally, there |
301 | /// are floating values that do not have a corresponding attribute list |
302 | /// position. |
303 | struct IRPosition { |
304 | // NOTE: In the future this definition can be changed to support recursive |
305 | // functions. |
306 | using CallBaseContext = CallBase; |
307 | |
308 | /// The positions we distinguish in the IR. |
309 | enum Kind : char { |
310 | IRP_INVALID, ///< An invalid position. |
311 | IRP_FLOAT, ///< A position that is not associated with a spot suitable |
312 | ///< for attributes. This could be any value or instruction. |
313 | IRP_RETURNED, ///< An attribute for the function return value. |
314 | IRP_CALL_SITE_RETURNED, ///< An attribute for a call site return value. |
315 | IRP_FUNCTION, ///< An attribute for a function (scope). |
316 | IRP_CALL_SITE, ///< An attribute for a call site (function scope). |
317 | IRP_ARGUMENT, ///< An attribute for a function argument. |
318 | IRP_CALL_SITE_ARGUMENT, ///< An attribute for a call site argument. |
319 | }; |
320 | |
321 | /// Default constructor available to create invalid positions implicitly. All |
322 | /// other positions need to be created explicitly through the appropriate |
323 | /// static member function. |
324 | IRPosition() : Enc(nullptr, ENC_VALUE) { verify(); } |
325 | |
326 | /// Create a position describing the value of \p V. |
327 | static const IRPosition value(const Value &V, |
328 | const CallBaseContext *CBContext = nullptr) { |
329 | if (auto *Arg = dyn_cast<Argument>(&V)) |
330 | return IRPosition::argument(*Arg, CBContext); |
331 | if (auto *CB = dyn_cast<CallBase>(&V)) |
332 | return IRPosition::callsite_returned(*CB); |
333 | return IRPosition(const_cast<Value &>(V), IRP_FLOAT, CBContext); |
334 | } |
335 | |
336 | /// Create a position describing the function scope of \p F. |
337 | /// \p CBContext is used for call base specific analysis. |
338 | static const IRPosition function(const Function &F, |
339 | const CallBaseContext *CBContext = nullptr) { |
340 | return IRPosition(const_cast<Function &>(F), IRP_FUNCTION, CBContext); |
341 | } |
342 | |
343 | /// Create a position describing the returned value of \p F. |
344 | /// \p CBContext is used for call base specific analysis. |
345 | static const IRPosition returned(const Function &F, |
346 | const CallBaseContext *CBContext = nullptr) { |
347 | return IRPosition(const_cast<Function &>(F), IRP_RETURNED, CBContext); |
348 | } |
349 | |
350 | /// Create a position describing the argument \p Arg. |
351 | /// \p CBContext is used for call base specific analysis. |
352 | static const IRPosition argument(const Argument &Arg, |
353 | const CallBaseContext *CBContext = nullptr) { |
354 | return IRPosition(const_cast<Argument &>(Arg), IRP_ARGUMENT, CBContext); |
355 | } |
356 | |
357 | /// Create a position describing the function scope of \p CB. |
358 | static const IRPosition callsite_function(const CallBase &CB) { |
359 | return IRPosition(const_cast<CallBase &>(CB), IRP_CALL_SITE); |
360 | } |
361 | |
362 | /// Create a position describing the returned value of \p CB. |
363 | static const IRPosition callsite_returned(const CallBase &CB) { |
364 | return IRPosition(const_cast<CallBase &>(CB), IRP_CALL_SITE_RETURNED); |
365 | } |
366 | |
367 | /// Create a position describing the argument of \p CB at position \p ArgNo. |
368 | static const IRPosition callsite_argument(const CallBase &CB, |
369 | unsigned ArgNo) { |
370 | return IRPosition(const_cast<Use &>(CB.getArgOperandUse(ArgNo)), |
371 | IRP_CALL_SITE_ARGUMENT); |
372 | } |
373 | |
374 | /// Create a position describing the argument of \p ACS at position \p ArgNo. |
375 | static const IRPosition callsite_argument(AbstractCallSite ACS, |
376 | unsigned ArgNo) { |
377 | if (ACS.getNumArgOperands() <= ArgNo) |
378 | return IRPosition(); |
379 | int CSArgNo = ACS.getCallArgOperandNo(ArgNo); |
380 | if (CSArgNo >= 0) |
381 | return IRPosition::callsite_argument( |
382 | cast<CallBase>(*ACS.getInstruction()), CSArgNo); |
383 | return IRPosition(); |
384 | } |
385 | |
386 | /// Create a position with function scope matching the "context" of \p IRP. |
387 | /// If \p IRP is a call site (see isAnyCallSitePosition()) then the result |
388 | /// will be a call site position, otherwise the function position of the |
389 | /// associated function. |
390 | static const IRPosition |
391 | function_scope(const IRPosition &IRP, |
392 | const CallBaseContext *CBContext = nullptr) { |
393 | if (IRP.isAnyCallSitePosition()) { |
394 | return IRPosition::callsite_function( |
395 | cast<CallBase>(IRP.getAnchorValue())); |
396 | } |
397 | assert(IRP.getAssociatedFunction());
398 | return IRPosition::function(*IRP.getAssociatedFunction(), CBContext); |
399 | } |
400 | |
401 | bool operator==(const IRPosition &RHS) const { |
402 | return Enc == RHS.Enc && RHS.CBContext == CBContext; |
403 | } |
404 | bool operator!=(const IRPosition &RHS) const { return !(*this == RHS); } |
405 | |
406 | /// Return the value this abstract attribute is anchored with. |
407 | /// |
408 | /// The anchor value might not be the associated value if the latter is not |
409 | /// sufficient to determine where arguments will be manifested. This is, so |
410 | /// far, only the case for call site arguments as the value is not sufficient |
411 | /// to pinpoint them. Instead, we can use the call site as an anchor. |
412 | Value &getAnchorValue() const { |
413 | switch (getEncodingBits()) { |
414 | case ENC_VALUE: |
415 | case ENC_RETURNED_VALUE: |
416 | case ENC_FLOATING_FUNCTION: |
417 | return *getAsValuePtr(); |
418 | case ENC_CALL_SITE_ARGUMENT_USE: |
419 | return *(getAsUsePtr()->getUser()); |
420 | default: |
421 | llvm_unreachable("Unkown encoding!")__builtin_unreachable(); |
422 | }; |
423 | } |
424 | |
425 | /// Return the associated function, if any. |
426 | Function *getAssociatedFunction() const { |
427 | if (auto *CB = dyn_cast<CallBase>(&getAnchorValue())) { |
428 | // We reuse the logic that associates callback callees to arguments of a
429 | // call site here to identify the callback callee as the associated |
430 | // function. |
431 | if (Argument *Arg = getAssociatedArgument()) |
432 | return Arg->getParent(); |
433 | return CB->getCalledFunction(); |
434 | } |
435 | return getAnchorScope(); |
436 | } |
437 | |
438 | /// Return the associated argument, if any. |
439 | Argument *getAssociatedArgument() const; |
440 | |
441 | /// Return true if the position refers to a function interface, that is the |
442 | /// function scope, the function return, or an argument. |
443 | bool isFnInterfaceKind() const { |
444 | switch (getPositionKind()) { |
445 | case IRPosition::IRP_FUNCTION: |
446 | case IRPosition::IRP_RETURNED: |
447 | case IRPosition::IRP_ARGUMENT: |
448 | return true; |
449 | default: |
450 | return false; |
451 | } |
452 | } |
453 | |
454 | /// Return the Function surrounding the anchor value. |
455 | Function *getAnchorScope() const { |
456 | Value &V = getAnchorValue(); |
457 | if (isa<Function>(V)) |
458 | return &cast<Function>(V); |
459 | if (isa<Argument>(V)) |
460 | return cast<Argument>(V).getParent(); |
461 | if (isa<Instruction>(V)) |
462 | return cast<Instruction>(V).getFunction(); |
463 | return nullptr; |
464 | } |
465 | |
466 | /// Return the context instruction, if any. |
467 | Instruction *getCtxI() const { |
468 | Value &V = getAnchorValue(); |
469 | if (auto *I = dyn_cast<Instruction>(&V)) |
470 | return I; |
471 | if (auto *Arg = dyn_cast<Argument>(&V)) |
472 | if (!Arg->getParent()->isDeclaration()) |
473 | return &Arg->getParent()->getEntryBlock().front(); |
474 | if (auto *F = dyn_cast<Function>(&V)) |
475 | if (!F->isDeclaration()) |
476 | return &(F->getEntryBlock().front()); |
477 | return nullptr; |
478 | } |
479 | |
480 | /// Return the value this abstract attribute is associated with. |
481 | Value &getAssociatedValue() const { |
482 | if (getCallSiteArgNo() < 0 || isa<Argument>(&getAnchorValue())) |
483 | return getAnchorValue(); |
484 | assert(isa<CallBase>(&getAnchorValue()) && "Expected a call base!");
485 | return *cast<CallBase>(&getAnchorValue()) |
486 | ->getArgOperand(getCallSiteArgNo()); |
487 | } |
488 | |
489 | /// Return the type this abstract attribute is associated with. |
490 | Type *getAssociatedType() const { |
491 | if (getPositionKind() == IRPosition::IRP_RETURNED) |
492 | return getAssociatedFunction()->getReturnType(); |
493 | return getAssociatedValue().getType(); |
494 | } |
495 | |
496 | /// Return the callee argument number of the associated value if it is an |
497 | /// argument or call site argument, otherwise a negative value. In contrast to |
498 | /// `getCallSiteArgNo` this method will always return the "argument number" |
499 | /// from the perspective of the callee. This may not be the same as the call site
500 | /// if this is a callback call. |
501 | int getCalleeArgNo() const { |
502 | return getArgNo(/* CallbackCalleeArgIfApplicable */ true); |
503 | } |
504 | |
505 | /// Return the call site argument number of the associated value if it is an |
506 | /// argument or call site argument, otherwise a negative value. In contrast to |
507 | /// `getCalleeArgNo` this method will always return the "operand number" from
508 | /// the perspective of the call site. This may not be the same as the callee
509 | /// perspective if this is a callback call. |
510 | int getCallSiteArgNo() const { |
511 | return getArgNo(/* CallbackCalleeArgIfApplicable */ false); |
512 | } |
513 | |
514 | /// Return the index in the attribute list for this position. |
515 | unsigned getAttrIdx() const { |
516 | switch (getPositionKind()) { |
517 | case IRPosition::IRP_INVALID: |
518 | case IRPosition::IRP_FLOAT: |
519 | break; |
520 | case IRPosition::IRP_FUNCTION: |
521 | case IRPosition::IRP_CALL_SITE: |
522 | return AttributeList::FunctionIndex; |
523 | case IRPosition::IRP_RETURNED: |
524 | case IRPosition::IRP_CALL_SITE_RETURNED: |
525 | return AttributeList::ReturnIndex; |
526 | case IRPosition::IRP_ARGUMENT: |
527 | case IRPosition::IRP_CALL_SITE_ARGUMENT: |
528 | return getCallSiteArgNo() + AttributeList::FirstArgIndex; |
529 | } |
530 | llvm_unreachable(
531 | "There is no attribute index for a floating or invalid position!");
532 | } |
533 | |
534 | /// Return the associated position kind. |
535 | Kind getPositionKind() const { |
536 | char EncodingBits = getEncodingBits(); |
537 | if (EncodingBits == ENC_CALL_SITE_ARGUMENT_USE) |
538 | return IRP_CALL_SITE_ARGUMENT; |
539 | if (EncodingBits == ENC_FLOATING_FUNCTION) |
540 | return IRP_FLOAT; |
541 | |
542 | Value *V = getAsValuePtr(); |
543 | if (!V) |
544 | return IRP_INVALID; |
545 | if (isa<Argument>(V)) |
546 | return IRP_ARGUMENT; |
547 | if (isa<Function>(V)) |
548 | return isReturnPosition(EncodingBits) ? IRP_RETURNED : IRP_FUNCTION; |
549 | if (isa<CallBase>(V)) |
550 | return isReturnPosition(EncodingBits) ? IRP_CALL_SITE_RETURNED |
551 | : IRP_CALL_SITE; |
552 | return IRP_FLOAT; |
553 | } |
554 | |
555 | /// TODO: Figure out if the attribute related helper functions should live |
556 | /// here or somewhere else. |
557 | |
558 | /// Return true if any kind in \p AKs existing in the IR at a position that |
559 | /// will affect this one. See also getAttrs(...). |
560 | /// \param IgnoreSubsumingPositions Flag to determine if subsuming positions, |
561 | /// e.g., the function position if this is an |
562 | /// argument position, should be ignored. |
563 | bool hasAttr(ArrayRef<Attribute::AttrKind> AKs, |
564 | bool IgnoreSubsumingPositions = false, |
565 | Attributor *A = nullptr) const; |
566 | |
567 | /// Return the attributes of any kind in \p AKs existing in the IR at a |
568 | /// position that will affect this one. While each position can only have a |
569 | /// single attribute of any kind in \p AKs, there are "subsuming" positions |
570 | /// that could have an attribute as well. This method returns all attributes |
571 | /// found in \p Attrs. |
572 | /// \param IgnoreSubsumingPositions Flag to determine if subsuming positions, |
573 | /// e.g., the function position if this is an |
574 | /// argument position, should be ignored. |
575 | void getAttrs(ArrayRef<Attribute::AttrKind> AKs, |
576 | SmallVectorImpl<Attribute> &Attrs, |
577 | bool IgnoreSubsumingPositions = false, |
578 | Attributor *A = nullptr) const; |
579 | |
580 | /// Remove the attribute of kind \p AKs existing in the IR at this position. |
581 | void removeAttrs(ArrayRef<Attribute::AttrKind> AKs) const { |
582 | if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT) |
583 | return; |
584 | |
585 | AttributeList AttrList; |
586 | auto *CB = dyn_cast<CallBase>(&getAnchorValue()); |
587 | if (CB) |
588 | AttrList = CB->getAttributes(); |
589 | else |
590 | AttrList = getAssociatedFunction()->getAttributes(); |
591 | |
592 | LLVMContext &Ctx = getAnchorValue().getContext(); |
593 | for (Attribute::AttrKind AK : AKs) |
594 | AttrList = AttrList.removeAttribute(Ctx, getAttrIdx(), AK); |
595 | |
596 | if (CB) |
597 | CB->setAttributes(AttrList); |
598 | else |
599 | getAssociatedFunction()->setAttributes(AttrList); |
600 | } |
601 | |
602 | bool isAnyCallSitePosition() const { |
603 | switch (getPositionKind()) { |
604 | case IRPosition::IRP_CALL_SITE: |
605 | case IRPosition::IRP_CALL_SITE_RETURNED: |
606 | case IRPosition::IRP_CALL_SITE_ARGUMENT: |
607 | return true; |
608 | default: |
609 | return false; |
610 | } |
611 | } |
612 | |
613 | /// Return true if the position is an argument or call site argument. |
614 | bool isArgumentPosition() const { |
615 | switch (getPositionKind()) { |
616 | case IRPosition::IRP_ARGUMENT: |
617 | case IRPosition::IRP_CALL_SITE_ARGUMENT: |
618 | return true; |
619 | default: |
620 | return false; |
621 | } |
622 | } |
623 | |
624 | /// Return the same position without the call base context. |
625 | IRPosition stripCallBaseContext() const { |
626 | IRPosition Result = *this; |
627 | Result.CBContext = nullptr; |
628 | return Result; |
629 | } |
630 | |
631 | /// Get the call base context from the position. |
632 | const CallBaseContext *getCallBaseContext() const { return CBContext; } |
633 | |
634 | /// Check if the position has any call base context. |
635 | bool hasCallBaseContext() const { return CBContext != nullptr; } |
636 | |
637 | /// Special DenseMap key values. |
638 | /// |
639 | ///{ |
640 | static const IRPosition EmptyKey; |
641 | static const IRPosition TombstoneKey; |
642 | ///} |
643 | |
644 | /// Conversion into a void * to allow reuse of pointer hashing. |
645 | operator void *() const { return Enc.getOpaqueValue(); } |
646 | |
647 | private: |
648 | /// Private constructor for special values only! |
649 | explicit IRPosition(void *Ptr, const CallBaseContext *CBContext = nullptr) |
650 | : CBContext(CBContext) { |
651 | Enc.setFromOpaqueValue(Ptr); |
652 | } |
653 | |
654 | /// IRPosition anchored at \p AnchorVal with kind/argument number \p PK.
655 | explicit IRPosition(Value &AnchorVal, Kind PK, |
656 | const CallBaseContext *CBContext = nullptr) |
657 | : CBContext(CBContext) { |
658 | switch (PK) { |
659 | case IRPosition::IRP_INVALID: |
660 | llvm_unreachable("Cannot create invalid IRP with an anchor value!")__builtin_unreachable(); |
661 | break; |
662 | case IRPosition::IRP_FLOAT: |
663 | // Special case for floating functions. |
664 | if (isa<Function>(AnchorVal)) |
665 | Enc = {&AnchorVal, ENC_FLOATING_FUNCTION}; |
666 | else |
667 | Enc = {&AnchorVal, ENC_VALUE}; |
668 | break; |
669 | case IRPosition::IRP_FUNCTION: |
670 | case IRPosition::IRP_CALL_SITE: |
671 | Enc = {&AnchorVal, ENC_VALUE}; |
672 | break; |
673 | case IRPosition::IRP_RETURNED: |
674 | case IRPosition::IRP_CALL_SITE_RETURNED: |
675 | Enc = {&AnchorVal, ENC_RETURNED_VALUE}; |
676 | break; |
677 | case IRPosition::IRP_ARGUMENT: |
678 | Enc = {&AnchorVal, ENC_VALUE}; |
679 | break; |
680 | case IRPosition::IRP_CALL_SITE_ARGUMENT: |
681 | llvm_unreachable(
682 | "Cannot create call site argument IRP with an anchor value!");
683 | break; |
684 | } |
685 | verify(); |
686 | } |
687 | |
688 | /// Return the callee argument number of the associated value if it is an |
689 | /// argument or call site argument. See also `getCalleeArgNo` and |
690 | /// `getCallSiteArgNo`. |
691 | int getArgNo(bool CallbackCalleeArgIfApplicable) const { |
692 | if (CallbackCalleeArgIfApplicable) |
693 | if (Argument *Arg = getAssociatedArgument()) |
694 | return Arg->getArgNo(); |
695 | switch (getPositionKind()) { |
696 | case IRPosition::IRP_ARGUMENT: |
697 | return cast<Argument>(getAsValuePtr())->getArgNo(); |
698 | case IRPosition::IRP_CALL_SITE_ARGUMENT: { |
699 | Use &U = *getAsUsePtr(); |
700 | return cast<CallBase>(U.getUser())->getArgOperandNo(&U); |
701 | } |
702 | default: |
703 | return -1; |
704 | } |
705 | } |
706 | |
707 | /// IRPosition for the use \p U. The position kind \p PK needs to be |
708 | /// IRP_CALL_SITE_ARGUMENT, the anchor value is the user, the associated value |
709 | /// the used value. |
710 | explicit IRPosition(Use &U, Kind PK) { |
711 | assert(PK == IRP_CALL_SITE_ARGUMENT &&
712 | "Use constructor is for call site arguments only!");
713 | Enc = {&U, ENC_CALL_SITE_ARGUMENT_USE}; |
714 | verify(); |
715 | } |
716 | |
717 | /// Verify internal invariants. |
718 | void verify(); |
719 | |
720 | /// Return the attributes of kind \p AK existing in the IR as attribute. |
721 | bool getAttrsFromIRAttr(Attribute::AttrKind AK, |
722 | SmallVectorImpl<Attribute> &Attrs) const; |
723 | |
724 | /// Return the attributes of kind \p AK existing in the IR as operand bundles |
725 | /// of an llvm.assume. |
726 | bool getAttrsFromAssumes(Attribute::AttrKind AK, |
727 | SmallVectorImpl<Attribute> &Attrs, |
728 | Attributor &A) const; |
729 | |
730 | /// Return the underlying pointer as Value *, valid for all positions but |
731 | /// IRP_CALL_SITE_ARGUMENT. |
732 | Value *getAsValuePtr() const { |
733 | assert(getEncodingBits() != ENC_CALL_SITE_ARGUMENT_USE &&
734 | "Not a value pointer!");
735 | return reinterpret_cast<Value *>(Enc.getPointer()); |
736 | } |
737 | |
738 | /// Return the underlying pointer as Use *, valid only for |
739 | /// IRP_CALL_SITE_ARGUMENT positions. |
740 | Use *getAsUsePtr() const { |
741 | assert(getEncodingBits() == ENC_CALL_SITE_ARGUMENT_USE &&
742 | "Not a value pointer!");
743 | return reinterpret_cast<Use *>(Enc.getPointer()); |
744 | } |
745 | |
746 | /// Return true if \p EncodingBits describe a returned or call site returned |
747 | /// position. |
748 | static bool isReturnPosition(char EncodingBits) { |
749 | return EncodingBits == ENC_RETURNED_VALUE; |
750 | } |
751 | |
752 | /// Return true if the encoding bits describe a returned or call site returned |
753 | /// position. |
754 | bool isReturnPosition() const { return isReturnPosition(getEncodingBits()); } |
755 | |
756 | /// The encoding of the IRPosition is a combination of a pointer and two |
757 | /// encoding bits. The values of the encoding bits are defined in the enum |
758 | /// below. The pointer is either a Value* (for the first three encoding bit |
759 | /// combinations) or Use* (for ENC_CALL_SITE_ARGUMENT_USE). |
760 | /// |
761 | ///{ |
762 | enum { |
763 | ENC_VALUE = 0b00, |
764 | ENC_RETURNED_VALUE = 0b01, |
765 | ENC_FLOATING_FUNCTION = 0b10, |
766 | ENC_CALL_SITE_ARGUMENT_USE = 0b11, |
767 | }; |
768 | |
769 | // Reserve the maximal amount of bits so there is no need to mask out the |
770 | // remaining ones. We will not encode anything else in the pointer anyway. |
771 | static constexpr int NumEncodingBits = |
772 | PointerLikeTypeTraits<void *>::NumLowBitsAvailable; |
773 | static_assert(NumEncodingBits >= 2, "At least two bits are required!"); |
774 | |
775 | /// The pointer with the encoding bits. |
776 | PointerIntPair<void *, NumEncodingBits, char> Enc; |
777 | ///} |
778 | |
779 | /// Call base context. Used for callsite specific analysis. |
780 | const CallBaseContext *CBContext = nullptr; |
781 | |
782 | /// Return the encoding bits. |
783 | char getEncodingBits() const { return Enc.getInt(); } |
784 | }; |
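As a usage sketch of the factory interface above (hedged; F and CB are assumed to be valid IR objects and CB to have at least one argument, and the include path matches this header):

#include "llvm/Transforms/IPO/Attributor.h"

void irPositionExamples(llvm::Function &F, llvm::CallBase &CB) {
  // Positions anchored at a function, its return value, and a call site arg.
  const llvm::IRPosition FnPos  = llvm::IRPosition::function(F);
  const llvm::IRPosition RetPos = llvm::IRPosition::returned(F);
  const llvm::IRPosition CSArg  = llvm::IRPosition::callsite_argument(CB, /*ArgNo=*/0);

  // Every position exposes its anchor value and kind.
  llvm::Value &Anchor = CSArg.getAnchorValue();        // the call base itself
  llvm::IRPosition::Kind K = CSArg.getPositionKind();  // IRP_CALL_SITE_ARGUMENT
  (void)FnPos; (void)RetPos; (void)Anchor; (void)K;
}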
785 | |
786 | /// Helper that allows IRPosition as a key in a DenseMap. |
787 | template <> struct DenseMapInfo<IRPosition> { |
788 | static inline IRPosition getEmptyKey() { return IRPosition::EmptyKey; } |
789 | static inline IRPosition getTombstoneKey() { |
790 | return IRPosition::TombstoneKey; |
791 | } |
792 | static unsigned getHashValue(const IRPosition &IRP) { |
793 | return (DenseMapInfo<void *>::getHashValue(IRP) << 4) ^ |
794 | (DenseMapInfo<Value *>::getHashValue(IRP.getCallBaseContext())); |
795 | } |
796 | |
797 | static bool isEqual(const IRPosition &a, const IRPosition &b) { |
798 | return a == b; |
799 | } |
800 | }; |
801 | |
802 | /// A visitor class for IR positions. |
803 | /// |
804 | /// Given a position P, the SubsumingPositionIterator allows to visit "subsuming |
805 | /// positions" wrt. attributes/information. Thus, if a piece of information |
806 | /// holds for a subsuming position, it also holds for the position P. |
807 | /// |
808 | /// The subsuming positions always include the initial position and then, |
809 | /// depending on the position kind, additionally the following ones: |
810 | /// - for IRP_RETURNED: |
811 | /// - the function (IRP_FUNCTION) |
812 | /// - for IRP_ARGUMENT: |
813 | /// - the function (IRP_FUNCTION) |
814 | /// - for IRP_CALL_SITE: |
815 | /// - the callee (IRP_FUNCTION), if known |
816 | /// - for IRP_CALL_SITE_RETURNED: |
817 | /// - the callee (IRP_RETURNED), if known |
818 | /// - the call site (IRP_FUNCTION) |
819 | /// - the callee (IRP_FUNCTION), if known |
820 | /// - for IRP_CALL_SITE_ARGUMENT: |
821 | /// - the argument of the callee (IRP_ARGUMENT), if known |
822 | /// - the callee (IRP_FUNCTION), if known |
823 | /// - the position the call site argument is associated with if it is not |
824 | /// anchored to the call site, e.g., if it is an argument then the argument |
825 | /// (IRP_ARGUMENT) |
826 | class SubsumingPositionIterator { |
827 | SmallVector<IRPosition, 4> IRPositions; |
828 | using iterator = decltype(IRPositions)::iterator; |
829 | |
830 | public: |
831 | SubsumingPositionIterator(const IRPosition &IRP); |
832 | iterator begin() { return IRPositions.begin(); } |
833 | iterator end() { return IRPositions.end(); } |
834 | }; |
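A short hedged sketch of how the iterator above is typically used (IRP is an existing position; the function name is illustrative):

#include "llvm/Transforms/IPO/Attributor.h"

void visitSubsuming(const llvm::IRPosition &IRP) {
  // Visits IRP itself plus the subsuming positions listed above, e.g. the
  // surrounding function position for an argument position.
  for (const llvm::IRPosition &SubIRP : llvm::SubsumingPositionIterator(IRP))
    (void)SubIRP; // query attributes/information at SubIRP here
}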
835 | |
836 | /// Wrapper for FunctionAnalysisManager.
837 | struct AnalysisGetter { |
838 | template <typename Analysis> |
839 | typename Analysis::Result *getAnalysis(const Function &F) { |
840 | if (!FAM || !F.getParent()) |
841 | return nullptr; |
842 | return &FAM->getResult<Analysis>(const_cast<Function &>(F)); |
843 | } |
844 | |
845 | AnalysisGetter(FunctionAnalysisManager &FAM) : FAM(&FAM) {} |
846 | AnalysisGetter() {} |
847 | |
848 | private: |
849 | FunctionAnalysisManager *FAM = nullptr; |
850 | }; |
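Usage sketch for the getter above (hedged; FAM is assumed to be a valid FunctionAnalysisManager with F registered in its module):

#include "llvm/Transforms/IPO/Attributor.h"

void analysisGetterExample(llvm::FunctionAnalysisManager &FAM,
                           const llvm::Function &F) {
  llvm::AnalysisGetter AG(FAM);
  // Returns nullptr if no FAM is available or F has no parent module.
  llvm::LoopInfo *LI = AG.getAnalysis<llvm::LoopAnalysis>(F);
  (void)LI;
}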
851 | |
852 | /// Data structure to hold cached (LLVM-IR) information. |
853 | /// |
854 | /// All attributes are given an InformationCache object at creation time to |
855 | /// avoid inspection of the IR by all of them individually. This default |
856 | /// InformationCache will hold information required by 'default' attributes, |
857 | /// thus the ones deduced when Attributor::identifyDefaultAbstractAttributes(..) |
858 | /// is called. |
859 | /// |
860 | /// If custom abstract attributes, registered manually through |
861 | /// Attributor::registerAA(...), need more information, especially if it is not |
862 | /// reusable, it is advised to inherit from the InformationCache and cast the |
863 | /// instance down in the abstract attributes. |
864 | struct InformationCache { |
865 | InformationCache(const Module &M, AnalysisGetter &AG, |
866 | BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC) |
867 | : DL(M.getDataLayout()), Allocator(Allocator), |
868 | Explorer( |
869 | /* ExploreInterBlock */ true, /* ExploreCFGForward */ true, |
870 | /* ExploreCFGBackward */ true, |
871 | /* LIGetter */ |
872 | [&](const Function &F) { return AG.getAnalysis<LoopAnalysis>(F); }, |
873 | /* DTGetter */ |
874 | [&](const Function &F) { |
875 | return AG.getAnalysis<DominatorTreeAnalysis>(F); |
876 | }, |
877 | /* PDTGetter */ |
878 | [&](const Function &F) { |
879 | return AG.getAnalysis<PostDominatorTreeAnalysis>(F); |
880 | }), |
881 | AG(AG), CGSCC(CGSCC), TargetTriple(M.getTargetTriple()) { |
882 | if (CGSCC) |
883 | initializeModuleSlice(*CGSCC); |
884 | } |
885 | |
886 | ~InformationCache() { |
887 | // The FunctionInfo objects are allocated via a BumpPtrAllocator, we call |
888 | // the destructor manually. |
889 | for (auto &It : FuncInfoMap) |
890 | It.getSecond()->~FunctionInfo(); |
891 | } |
892 | |
893 | /// Apply \p CB to all uses of \p F. If \p LookThroughConstantExprUses is |
894 | /// true, constant expression users are not given to \p CB but their uses are |
895 | /// traversed transitively. |
896 | template <typename CBTy> |
897 | static void foreachUse(Function &F, CBTy CB, |
898 | bool LookThroughConstantExprUses = true) { |
899 | SmallVector<Use *, 8> Worklist(make_pointer_range(F.uses())); |
900 | |
901 | for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) { |
902 | Use &U = *Worklist[Idx]; |
903 | |
904 | // Allow use in constant bitcasts and simply look through them. |
905 | if (LookThroughConstantExprUses && isa<ConstantExpr>(U.getUser())) { |
906 | for (Use &CEU : cast<ConstantExpr>(U.getUser())->uses()) |
907 | Worklist.push_back(&CEU); |
908 | continue; |
909 | } |
910 | |
911 | CB(U); |
912 | } |
913 | } |
914 | |
915 | /// Initialize the ModuleSlice member based on \p SCC. ModuleSlice contains
916 | /// (a subset of) all functions that we can look at during this SCC traversal. |
917 | /// This includes functions (transitively) called from the SCC and the |
918 | /// (transitive) callers of SCC functions. We also can look at a function if |
919 | /// there is a "reference edge", i.a., if the function somehow uses (!=calls) |
920 | /// a function in the SCC or a caller of a function in the SCC. |
921 | void initializeModuleSlice(SetVector<Function *> &SCC) { |
922 | ModuleSlice.insert(SCC.begin(), SCC.end()); |
923 | |
924 | SmallPtrSet<Function *, 16> Seen; |
925 | SmallVector<Function *, 16> Worklist(SCC.begin(), SCC.end()); |
926 | while (!Worklist.empty()) { |
927 | Function *F = Worklist.pop_back_val(); |
928 | ModuleSlice.insert(F); |
929 | |
930 | for (Instruction &I : instructions(*F)) |
931 | if (auto *CB = dyn_cast<CallBase>(&I)) |
932 | if (Function *Callee = CB->getCalledFunction()) |
933 | if (Seen.insert(Callee).second) |
934 | Worklist.push_back(Callee); |
935 | } |
936 | |
937 | Seen.clear(); |
938 | Worklist.append(SCC.begin(), SCC.end()); |
939 | while (!Worklist.empty()) { |
940 | Function *F = Worklist.pop_back_val(); |
941 | ModuleSlice.insert(F); |
942 | |
943 | // Traverse all transitive uses. |
944 | foreachUse(*F, [&](Use &U) { |
945 | if (auto *UsrI = dyn_cast<Instruction>(U.getUser())) |
946 | if (Seen.insert(UsrI->getFunction()).second) |
947 | Worklist.push_back(UsrI->getFunction()); |
948 | }); |
949 | } |
950 | } |
951 | |
952 | /// The slice of the module we are allowed to look at. |
953 | SmallPtrSet<Function *, 8> ModuleSlice; |
954 | |
955 | /// A vector type to hold instructions. |
956 | using InstructionVectorTy = SmallVector<Instruction *, 8>; |
957 | |
958 | /// A map type from opcodes to instructions with this opcode. |
959 | using OpcodeInstMapTy = DenseMap<unsigned, InstructionVectorTy *>; |
960 | |
961 | /// Return the map that relates "interesting" opcodes with all instructions |
962 | /// with that opcode in \p F. |
963 | OpcodeInstMapTy &getOpcodeInstMapForFunction(const Function &F) { |
964 | return getFunctionInfo(F).OpcodeInstMap; |
965 | } |
966 | |
967 | /// Return the instructions in \p F that may read or write memory. |
968 | InstructionVectorTy &getReadOrWriteInstsForFunction(const Function &F) { |
969 | return getFunctionInfo(F).RWInsts; |
970 | } |
971 | |
972 | /// Return MustBeExecutedContextExplorer |
973 | MustBeExecutedContextExplorer &getMustBeExecutedContextExplorer() { |
974 | return Explorer; |
975 | } |
976 | |
977 | /// Return TargetLibraryInfo for function \p F. |
978 | TargetLibraryInfo *getTargetLibraryInfoForFunction(const Function &F) { |
979 | return AG.getAnalysis<TargetLibraryAnalysis>(F); |
980 | } |
981 | |
982 | /// Return AliasAnalysis Result for function \p F. |
983 | AAResults *getAAResultsForFunction(const Function &F); |
984 | |
985 | /// Return true if \p Arg is involved in a must-tail call, thus the argument |
986 | /// of the caller or callee. |
987 | bool isInvolvedInMustTailCall(const Argument &Arg) { |
988 | FunctionInfo &FI = getFunctionInfo(*Arg.getParent()); |
989 | return FI.CalledViaMustTail || FI.ContainsMustTailCall; |
990 | } |
991 | |
992 | /// Return the analysis result from a pass \p AP for function \p F. |
993 | template <typename AP> |
994 | typename AP::Result *getAnalysisResultForFunction(const Function &F) { |
995 | return AG.getAnalysis<AP>(F); |
996 | } |
997 | |
998 | /// Return SCC size on call graph for function \p F or 0 if unknown. |
999 | unsigned getSccSize(const Function &F) { |
1000 | if (CGSCC && CGSCC->count(const_cast<Function *>(&F))) |
1001 | return CGSCC->size(); |
1002 | return 0; |
1003 | } |
1004 | |
1005 | /// Return datalayout used in the module. |
1006 | const DataLayout &getDL() { return DL; } |
1007 | |
1008 | /// Return the map containing all the knowledge we have from `llvm.assume`s.
1009 | const RetainedKnowledgeMap &getKnowledgeMap() const { return KnowledgeMap; } |
1010 | |
1011 | /// Return if \p To is potentially reachable from \p From or not.
1012 | /// If the same query was answered before, return the cached result.
1013 | bool getPotentiallyReachable(const Instruction &From, const Instruction &To) { |
1014 | auto KeyPair = std::make_pair(&From, &To); |
1015 | auto Iter = PotentiallyReachableMap.find(KeyPair); |
1016 | if (Iter != PotentiallyReachableMap.end()) |
1017 | return Iter->second; |
1018 | const Function &F = *From.getFunction(); |
1019 | bool Result = true; |
1020 | if (From.getFunction() == To.getFunction()) |
1021 | Result = isPotentiallyReachable(&From, &To, nullptr, |
1022 | AG.getAnalysis<DominatorTreeAnalysis>(F), |
1023 | AG.getAnalysis<LoopAnalysis>(F)); |
1024 | PotentiallyReachableMap.insert(std::make_pair(KeyPair, Result)); |
1025 | return Result; |
1026 | } |
1027 | |
1028 | /// Check whether \p F is part of module slice. |
1029 | bool isInModuleSlice(const Function &F) { |
1030 | return ModuleSlice.count(const_cast<Function *>(&F)); |
1031 | } |
1032 | |
1033 | /// Return true if the stack (llvm::Alloca) can be accessed by other threads. |
1034 | bool stackIsAccessibleByOtherThreads() { return !targetIsGPU(); } |
1035 | |
1036 | /// Return true if the target is a GPU. |
1037 | bool targetIsGPU() { |
1038 | return TargetTriple.isAMDGPU() || TargetTriple.isNVPTX(); |
1039 | } |
1040 | |
1041 | private: |
1042 | struct FunctionInfo { |
1043 | ~FunctionInfo(); |
1044 | |
1045 | /// A nested map that remembers all instructions in a function with a |
1046 | /// certain instruction opcode (Instruction::getOpcode()). |
1047 | OpcodeInstMapTy OpcodeInstMap; |
1048 | |
1049 | /// A map from functions to their instructions that may read or write |
1050 | /// memory. |
1051 | InstructionVectorTy RWInsts; |
1052 | |
1053 | /// Function is called by a `musttail` call. |
1054 | bool CalledViaMustTail; |
1055 |