| File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |
| Warning: | line 118, column 5: Value stored to 'Ctor' is never read |
| 1 | //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// The AMDGPU target machine contains all of the hardware specific |
| 11 | /// information needed to emit code for R600 and SI GPUs. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "AMDGPUTargetMachine.h" |
| 16 | #include "AMDGPU.h" |
| 17 | #include "AMDGPUAliasAnalysis.h" |
| 18 | #include "AMDGPUExportClustering.h" |
| 19 | #include "AMDGPUMacroFusion.h" |
| 20 | #include "AMDGPUTargetObjectFile.h" |
| 21 | #include "AMDGPUTargetTransformInfo.h" |
| 22 | #include "GCNIterativeScheduler.h" |
| 23 | #include "GCNSchedStrategy.h" |
| 24 | #include "R600MachineScheduler.h" |
| 25 | #include "SIMachineFunctionInfo.h" |
| 26 | #include "SIMachineScheduler.h" |
| 27 | #include "TargetInfo/AMDGPUTargetInfo.h" |
| 28 | #include "llvm/Analysis/CGSCCPassManager.h" |
| 29 | #include "llvm/CodeGen/GlobalISel/IRTranslator.h" |
| 30 | #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" |
| 31 | #include "llvm/CodeGen/GlobalISel/Legalizer.h" |
| 32 | #include "llvm/CodeGen/GlobalISel/Localizer.h" |
| 33 | #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" |
| 34 | #include "llvm/CodeGen/MIRParser/MIParser.h" |
| 35 | #include "llvm/CodeGen/Passes.h" |
| 36 | #include "llvm/CodeGen/RegAllocRegistry.h" |
| 37 | #include "llvm/CodeGen/TargetPassConfig.h" |
| 38 | #include "llvm/IR/LegacyPassManager.h" |
| 39 | #include "llvm/IR/PassManager.h" |
| 40 | #include "llvm/InitializePasses.h" |
| 41 | #include "llvm/Passes/PassBuilder.h" |
| 42 | #include "llvm/Support/TargetRegistry.h" |
| 43 | #include "llvm/Transforms/IPO.h" |
| 44 | #include "llvm/Transforms/IPO/AlwaysInliner.h" |
| 45 | #include "llvm/Transforms/IPO/GlobalDCE.h" |
| 46 | #include "llvm/Transforms/IPO/Internalize.h" |
| 47 | #include "llvm/Transforms/IPO/PassManagerBuilder.h" |
| 48 | #include "llvm/Transforms/Scalar.h" |
| 49 | #include "llvm/Transforms/Scalar/GVN.h" |
| 50 | #include "llvm/Transforms/Scalar/InferAddressSpaces.h" |
| 51 | #include "llvm/Transforms/Utils.h" |
| 52 | #include "llvm/Transforms/Utils/SimplifyLibCalls.h" |
| 53 | #include "llvm/Transforms/Vectorize.h" |
| 54 | |
| 55 | using namespace llvm; |
| 56 | |
| 57 | namespace { |
| 58 | class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { |
| 59 | public: |
| 60 | SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
| 61 | : RegisterRegAllocBase(N, D, C) {} |
| 62 | }; |
| 63 | |
| 64 | class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { |
| 65 | public: |
| 66 | VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) |
| 67 | : RegisterRegAllocBase(N, D, C) {} |
| 68 | }; |
| 69 | |
| 70 | static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, |
| 71 | const TargetRegisterClass &RC) { |
| 72 | return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
| 73 | } |
| 74 | |
| 75 | static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, |
| 76 | const TargetRegisterClass &RC) { |
| 77 | return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); |
| 78 | } |
| 79 | |
| 80 | |
| 81 | /// -{sgpr|vgpr}-regalloc=... command line option. |
| 82 | static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } |
| 83 | |
| 84 | /// A dummy default pass factory indicates whether the register allocator is |
| 85 | /// overridden on the command line. |
| 86 | static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; |
| 87 | static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; |
| 88 | |
| 89 | static SGPRRegisterRegAlloc |
| 90 | defaultSGPRRegAlloc("default", |
| 91 | "pick SGPR register allocator based on -O option", |
| 92 | useDefaultRegisterAllocator); |
| 93 | |
| 94 | static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, |
| 95 | RegisterPassParser<SGPRRegisterRegAlloc>> |
| 96 | SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), |
| 97 | cl::desc("Register allocator to use for SGPRs")); |
| 98 | |
| 99 | static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, |
| 100 | RegisterPassParser<VGPRRegisterRegAlloc>> |
| 101 | VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), |
| 102 | cl::desc("Register allocator to use for VGPRs")); |
| 103 | |
| 104 | |
| 105 | static void initializeDefaultSGPRRegisterAllocatorOnce() { |
| 106 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
| 107 | |
| 108 | if (!Ctor) { |
| 109 | Ctor = SGPRRegAlloc; |
| 110 | SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | static void initializeDefaultVGPRRegisterAllocatorOnce() { |
| 115 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
| 116 | |
| 117 | if (!Ctor) { |
| 118 | Ctor = VGPRRegAlloc; |
| | Value stored to 'Ctor' is never read |
| 119 | VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); |
| 120 | } |
| 121 | } |
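| | // A possible cleanup for the dead store flagged above (a sketch, not the |
| | // upstream fix): drop the unused local and query the default directly, e.g. |
| | //   if (!VGPRRegisterRegAlloc::getDefault()) |
| | //     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); |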
| 122 | |
| 123 | static FunctionPass *createBasicSGPRRegisterAllocator() { |
| 124 | return createBasicRegisterAllocator(onlyAllocateSGPRs); |
| 125 | } |
| 126 | |
| 127 | static FunctionPass *createGreedySGPRRegisterAllocator() { |
| 128 | return createGreedyRegisterAllocator(onlyAllocateSGPRs); |
| 129 | } |
| 130 | |
| 131 | static FunctionPass *createFastSGPRRegisterAllocator() { |
| 132 | return createFastRegisterAllocator(onlyAllocateSGPRs, false); |
| 133 | } |
| 134 | |
| 135 | static FunctionPass *createBasicVGPRRegisterAllocator() { |
| 136 | return createBasicRegisterAllocator(onlyAllocateVGPRs); |
| 137 | } |
| 138 | |
| 139 | static FunctionPass *createGreedyVGPRRegisterAllocator() { |
| 140 | return createGreedyRegisterAllocator(onlyAllocateVGPRs); |
| 141 | } |
| 142 | |
| 143 | static FunctionPass *createFastVGPRRegisterAllocator() { |
| 144 | return createFastRegisterAllocator(onlyAllocateVGPRs, true); |
| 145 | } |
| 146 | |
| 147 | static SGPRRegisterRegAlloc basicRegAllocSGPR( |
| 148 | "basic", "basic register allocator", createBasicSGPRRegisterAllocator); |
| 149 | static SGPRRegisterRegAlloc greedyRegAllocSGPR( |
| 150 | "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); |
| 151 | |
| 152 | static SGPRRegisterRegAlloc fastRegAllocSGPR( |
| 153 | "fast", "fast register allocator", createFastSGPRRegisterAllocator); |
| 154 | |
| 155 | |
| 156 | static VGPRRegisterRegAlloc basicRegAllocVGPR( |
| 157 | "basic", "basic register allocator", createBasicVGPRRegisterAllocator); |
| 158 | static VGPRRegisterRegAlloc greedyRegAllocVGPR( |
| 159 | "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); |
| 160 | |
| 161 | static VGPRRegisterRegAlloc fastRegAllocVGPR( |
| 162 | "fast", "fast register allocator", createFastVGPRRegisterAllocator); |
| 163 | } |
| 164 | |
| 165 | |
| 166 | static cl::opt<bool> EnableR600StructurizeCFG( |
| 167 | "r600-ir-structurize", |
| 168 | cl::desc("Use StructurizeCFG IR pass"), |
| 169 | cl::init(true)); |
| 170 | |
| 171 | static cl::opt<bool> EnableSROA( |
| 172 | "amdgpu-sroa", |
| 173 | cl::desc("Run SROA after promote alloca pass"), |
| 174 | cl::ReallyHidden, |
| 175 | cl::init(true)); |
| 176 | |
| 177 | static cl::opt<bool> |
| 178 | EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, |
| 179 | cl::desc("Run early if-conversion"), |
| 180 | cl::init(false)); |
| 181 | |
| 182 | static cl::opt<bool> |
| 183 | OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, |
| 184 | cl::desc("Run pre-RA exec mask optimizations"), |
| 185 | cl::init(true)); |
| 186 | |
| 187 | static cl::opt<bool> EnableR600IfConvert( |
| 188 | "r600-if-convert", |
| 189 | cl::desc("Use if conversion pass"), |
| 190 | cl::ReallyHidden, |
| 191 | cl::init(true)); |
| 192 | |
| 193 | // Option to disable vectorizer for tests. |
| 194 | static cl::opt<bool> EnableLoadStoreVectorizer( |
| 195 | "amdgpu-load-store-vectorizer", |
| 196 | cl::desc("Enable load store vectorizer"), |
| 197 | cl::init(true), |
| 198 | cl::Hidden); |
| 199 | |
| 200 | // Option to control global loads scalarization |
| 201 | static cl::opt<bool> ScalarizeGlobal( |
| 202 | "amdgpu-scalarize-global-loads", |
| 203 | cl::desc("Enable global load scalarization"), |
| 204 | cl::init(true), |
| 205 | cl::Hidden); |
| 206 | |
| 207 | // Option to run internalize pass. |
| 208 | static cl::opt<bool> InternalizeSymbols( |
| 209 | "amdgpu-internalize-symbols", |
| 210 | cl::desc("Enable elimination of non-kernel functions and unused globals"), |
| 211 | cl::init(false), |
| 212 | cl::Hidden); |
| 213 | |
| 214 | // Option to inline all early. |
| 215 | static cl::opt<bool> EarlyInlineAll( |
| 216 | "amdgpu-early-inline-all", |
| 217 | cl::desc("Inline all functions early"), |
| 218 | cl::init(false), |
| 219 | cl::Hidden); |
| 220 | |
| 221 | static cl::opt<bool> EnableSDWAPeephole( |
| 222 | "amdgpu-sdwa-peephole", |
| 223 | cl::desc("Enable SDWA peepholer"), |
| 224 | cl::init(true)); |
| 225 | |
| 226 | static cl::opt<bool> EnableDPPCombine( |
| 227 | "amdgpu-dpp-combine", |
| 228 | cl::desc("Enable DPP combiner"), |
| 229 | cl::init(true)); |
| 230 | |
| 231 | // Enable address space based alias analysis |
| 232 | static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, |
| 233 | cl::desc("Enable AMDGPU Alias Analysis"), |
| 234 | cl::init(true)); |
| 235 | |
| 236 | // Option to run late CFG structurizer |
| 237 | static cl::opt<bool, true> LateCFGStructurize( |
| 238 | "amdgpu-late-structurize", |
| 239 | cl::desc("Enable late CFG structurization"), |
| 240 | cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), |
| 241 | cl::Hidden); |
| 242 | |
| 243 | static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt( |
| 244 | "amdgpu-function-calls", |
| 245 | cl::desc("Enable AMDGPU function call support"), |
| 246 | cl::location(AMDGPUTargetMachine::EnableFunctionCalls), |
| 247 | cl::init(true), |
| 248 | cl::Hidden); |
| 249 | |
| 250 | static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( |
| 251 | "amdgpu-fixed-function-abi", |
| 252 | cl::desc("Enable all implicit function arguments"), |
| 253 | cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), |
| 254 | cl::init(false), |
| 255 | cl::Hidden); |
| 256 | |
| 257 | // Enable lib calls simplifications |
| 258 | static cl::opt<bool> EnableLibCallSimplify( |
| 259 | "amdgpu-simplify-libcall", |
| 260 | cl::desc("Enable amdgpu library simplifications"), |
| 261 | cl::init(true), |
| 262 | cl::Hidden); |
| 263 | |
| 264 | static cl::opt<bool> EnableLowerKernelArguments( |
| 265 | "amdgpu-ir-lower-kernel-arguments", |
| 266 | cl::desc("Lower kernel argument loads in IR pass"), |
| 267 | cl::init(true), |
| 268 | cl::Hidden); |
| 269 | |
| 270 | static cl::opt<bool> EnableRegReassign( |
| 271 | "amdgpu-reassign-regs", |
| 272 | cl::desc("Enable register reassign optimizations on gfx10+"), |
| 273 | cl::init(true), |
| 274 | cl::Hidden); |
| 275 | |
| 276 | static cl::opt<bool> OptVGPRLiveRange( |
| 277 | "amdgpu-opt-vgpr-liverange", |
| 278 | cl::desc("Enable VGPR liverange optimizations for if-else structure"), |
| 279 | cl::init(true), cl::Hidden); |
| 280 | |
| 281 | // Enable atomic optimization |
| 282 | static cl::opt<bool> EnableAtomicOptimizations( |
| 283 | "amdgpu-atomic-optimizations", |
| 284 | cl::desc("Enable atomic optimizations"), |
| 285 | cl::init(false), |
| 286 | cl::Hidden); |
| 287 | |
| 288 | // Enable Mode register optimization |
| 289 | static cl::opt<bool> EnableSIModeRegisterPass( |
| 290 | "amdgpu-mode-register", |
| 291 | cl::desc("Enable mode register pass"), |
| 292 | cl::init(true), |
| 293 | cl::Hidden); |
| 294 | |
| 295 | // Option is used in lit tests to prevent deadcoding of patterns inspected. |
| 296 | static cl::opt<bool> |
| 297 | EnableDCEInRA("amdgpu-dce-in-ra", |
| 298 | cl::init(true), cl::Hidden, |
| 299 | cl::desc("Enable machine DCE inside regalloc")); |
| 300 | |
| 301 | static cl::opt<bool> EnableScalarIRPasses( |
| 302 | "amdgpu-scalar-ir-passes", |
| 303 | cl::desc("Enable scalar IR passes"), |
| 304 | cl::init(true), |
| 305 | cl::Hidden); |
| 306 | |
| 307 | static cl::opt<bool> EnableStructurizerWorkarounds( |
| 308 | "amdgpu-enable-structurizer-workarounds", |
| 309 | cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), |
| 310 | cl::Hidden); |
| 311 | |
| 312 | static cl::opt<bool> EnableLDSReplaceWithPointer( |
| 313 | "amdgpu-enable-lds-replace-with-pointer", |
| 314 | cl::desc("Enable LDS replace with pointer pass"), cl::init(false), |
| 315 | cl::Hidden); |
| 316 | |
| 317 | static cl::opt<bool, true> EnableLowerModuleLDS( |
| 318 | "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), |
| 319 | cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), |
| 320 | cl::Hidden); |
| 321 | |
| 322 | static cl::opt<bool> EnablePreRAOptimizations( |
| 323 | "amdgpu-enable-pre-ra-optimizations", |
| 324 | cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), |
| 325 | cl::Hidden); |
| 326 | |
| 327 | extern "C" LLVM_EXTERNAL_VISIBILITY__attribute__ ((visibility("default"))) void LLVMInitializeAMDGPUTarget() { |
| 328 | // Register the target |
| 329 | RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); |
| 330 | RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); |
| 331 | |
| 332 | PassRegistry *PR = PassRegistry::getPassRegistry(); |
| 333 | initializeR600ClauseMergePassPass(*PR); |
| 334 | initializeR600ControlFlowFinalizerPass(*PR); |
| 335 | initializeR600PacketizerPass(*PR); |
| 336 | initializeR600ExpandSpecialInstrsPassPass(*PR); |
| 337 | initializeR600VectorRegMergerPass(*PR); |
| 338 | initializeGlobalISel(*PR); |
| 339 | initializeAMDGPUDAGToDAGISelPass(*PR); |
| 340 | initializeGCNDPPCombinePass(*PR); |
| 341 | initializeSILowerI1CopiesPass(*PR); |
| 342 | initializeSILowerSGPRSpillsPass(*PR); |
| 343 | initializeSIFixSGPRCopiesPass(*PR); |
| 344 | initializeSIFixVGPRCopiesPass(*PR); |
| 345 | initializeSIFoldOperandsPass(*PR); |
| 346 | initializeSIPeepholeSDWAPass(*PR); |
| 347 | initializeSIShrinkInstructionsPass(*PR); |
| 348 | initializeSIOptimizeExecMaskingPreRAPass(*PR); |
| 349 | initializeSIOptimizeVGPRLiveRangePass(*PR); |
| 350 | initializeSILoadStoreOptimizerPass(*PR); |
| 351 | initializeAMDGPUFixFunctionBitcastsPass(*PR); |
| 352 | initializeAMDGPUAlwaysInlinePass(*PR); |
| 353 | initializeAMDGPUAttributorPass(*PR); |
| 354 | initializeAMDGPUAnnotateKernelFeaturesPass(*PR); |
| 355 | initializeAMDGPUAnnotateUniformValuesPass(*PR); |
| 356 | initializeAMDGPUArgumentUsageInfoPass(*PR); |
| 357 | initializeAMDGPUAtomicOptimizerPass(*PR); |
| 358 | initializeAMDGPULowerKernelArgumentsPass(*PR); |
| 359 | initializeAMDGPULowerKernelAttributesPass(*PR); |
| 360 | initializeAMDGPULowerIntrinsicsPass(*PR); |
| 361 | initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); |
| 362 | initializeAMDGPUPostLegalizerCombinerPass(*PR); |
| 363 | initializeAMDGPUPreLegalizerCombinerPass(*PR); |
| 364 | initializeAMDGPURegBankCombinerPass(*PR); |
| 365 | initializeAMDGPUPromoteAllocaPass(*PR); |
| 366 | initializeAMDGPUPromoteAllocaToVectorPass(*PR); |
| 367 | initializeAMDGPUCodeGenPreparePass(*PR); |
| 368 | initializeAMDGPULateCodeGenPreparePass(*PR); |
| 369 | initializeAMDGPUPropagateAttributesEarlyPass(*PR); |
| 370 | initializeAMDGPUPropagateAttributesLatePass(*PR); |
| 371 | initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); |
| 372 | initializeAMDGPULowerModuleLDSPass(*PR); |
| 373 | initializeAMDGPURewriteOutArgumentsPass(*PR); |
| 374 | initializeAMDGPUUnifyMetadataPass(*PR); |
| 375 | initializeSIAnnotateControlFlowPass(*PR); |
| 376 | initializeSIInsertHardClausesPass(*PR); |
| 377 | initializeSIInsertWaitcntsPass(*PR); |
| 378 | initializeSIModeRegisterPass(*PR); |
| 379 | initializeSIWholeQuadModePass(*PR); |
| 380 | initializeSILowerControlFlowPass(*PR); |
| 381 | initializeSIPreEmitPeepholePass(*PR); |
| 382 | initializeSILateBranchLoweringPass(*PR); |
| 383 | initializeSIMemoryLegalizerPass(*PR); |
| 384 | initializeSIOptimizeExecMaskingPass(*PR); |
| 385 | initializeSIPreAllocateWWMRegsPass(*PR); |
| 386 | initializeSIFormMemoryClausesPass(*PR); |
| 387 | initializeSIPostRABundlerPass(*PR); |
| 388 | initializeAMDGPUUnifyDivergentExitNodesPass(*PR); |
| 389 | initializeAMDGPUAAWrapperPassPass(*PR); |
| 390 | initializeAMDGPUExternalAAWrapperPass(*PR); |
| 391 | initializeAMDGPUUseNativeCallsPass(*PR); |
| 392 | initializeAMDGPUSimplifyLibCallsPass(*PR); |
| 393 | initializeAMDGPUPrintfRuntimeBindingPass(*PR); |
| 394 | initializeAMDGPUResourceUsageAnalysisPass(*PR); |
| 395 | initializeGCNNSAReassignPass(*PR); |
| 396 | initializeGCNPreRAOptimizationsPass(*PR); |
| 397 | } |
| 398 | |
| 399 | static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { |
| 400 | return std::make_unique<AMDGPUTargetObjectFile>(); |
| 401 | } |
| 402 | |
| 403 | static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { |
| 404 | return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>()); |
| 405 | } |
| 406 | |
| 407 | static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { |
| 408 | return new SIScheduleDAGMI(C); |
| 409 | } |
| 410 | |
| 411 | static ScheduleDAGInstrs * |
| 412 | createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
| 413 | ScheduleDAGMILive *DAG = |
| 414 | new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); |
| 415 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 416 | DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); |
| 417 | DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); |
| 418 | return DAG; |
| 419 | } |
| 420 | |
| 421 | static ScheduleDAGInstrs * |
| 422 | createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { |
| 423 | auto DAG = new GCNIterativeScheduler(C, |
| 424 | GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); |
| 425 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 426 | return DAG; |
| 427 | } |
| 428 | |
| 429 | static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { |
| 430 | return new GCNIterativeScheduler(C, |
| 431 | GCNIterativeScheduler::SCHEDULE_MINREGFORCED); |
| 432 | } |
| 433 | |
| 434 | static ScheduleDAGInstrs * |
| 435 | createIterativeILPMachineScheduler(MachineSchedContext *C) { |
| 436 | auto DAG = new GCNIterativeScheduler(C, |
| 437 | GCNIterativeScheduler::SCHEDULE_ILP); |
| 438 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 439 | DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); |
| 440 | return DAG; |
| 441 | } |
| 442 | |
| 443 | static MachineSchedRegistry |
| 444 | R600SchedRegistry("r600", "Run R600's custom scheduler", |
| 445 | createR600MachineScheduler); |
| 446 | |
| 447 | static MachineSchedRegistry |
| 448 | SISchedRegistry("si", "Run SI's custom scheduler", |
| 449 | createSIMachineScheduler); |
| 450 | |
| 451 | static MachineSchedRegistry |
| 452 | GCNMaxOccupancySchedRegistry("gcn-max-occupancy", |
| 453 | "Run GCN scheduler to maximize occupancy", |
| 454 | createGCNMaxOccupancyMachineScheduler); |
| 455 | |
| 456 | static MachineSchedRegistry |
| 457 | IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", |
| 458 | "Run GCN scheduler to maximize occupancy (experimental)", |
| 459 | createIterativeGCNMaxOccupancyMachineScheduler); |
| 460 | |
| 461 | static MachineSchedRegistry |
| 462 | GCNMinRegSchedRegistry("gcn-minreg", |
| 463 | "Run GCN iterative scheduler for minimal register usage (experimental)", |
| 464 | createMinRegScheduler); |
| 465 | |
| 466 | static MachineSchedRegistry |
| 467 | GCNILPSchedRegistry("gcn-ilp", |
| 468 | "Run GCN iterative scheduler for ILP scheduling (experimental)", |
| 469 | createIterativeILPMachineScheduler); |
| 470 | |
| 471 | static StringRef computeDataLayout(const Triple &TT) { |
| 472 | if (TT.getArch() == Triple::r600) { |
| 473 | // 32-bit pointers. |
| 474 | return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
| 475 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; |
| 476 | } |
| 477 | |
| 478 | // 32-bit private, local, and region pointers. 64-bit global, constant and |
| 479 | // flat, non-integral buffer fat pointers. |
| 480 | return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" |
| 481 | "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" |
| 482 | "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" |
| 483 | "-ni:7"; |
| 484 | } |
| 485 | |
| 486 | LLVM_READNONE |
| 487 | static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { |
| 488 | if (!GPU.empty()) |
| 489 | return GPU; |
| 490 | |
| 491 | // Need to default to a target with flat support for HSA. |
| 492 | if (TT.getArch() == Triple::amdgcn) |
| 493 | return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; |
| 494 | |
| 495 | return "r600"; |
| 496 | } |
| 497 | |
| 498 | static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { |
| 499 | // The AMDGPU toolchain only supports generating shared objects, so we |
| 500 | // must always use PIC. |
| 501 | return Reloc::PIC_; |
| 502 | } |
| 503 | |
| 504 | AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, |
| 505 | StringRef CPU, StringRef FS, |
| 506 | TargetOptions Options, |
| 507 | Optional<Reloc::Model> RM, |
| 508 | Optional<CodeModel::Model> CM, |
| 509 | CodeGenOpt::Level OptLevel) |
| 510 | : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), |
| 511 | FS, Options, getEffectiveRelocModel(RM), |
| 512 | getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), |
| 513 | TLOF(createTLOF(getTargetTriple())) { |
| 514 | initAsmInfo(); |
| 515 | if (TT.getArch() == Triple::amdgcn) { |
| 516 | if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) |
| 517 | MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); |
| 518 | else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) |
| 519 | MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); |
| 520 | } |
| 521 | } |
| 522 | |
| 523 | bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; |
| 524 | bool AMDGPUTargetMachine::EnableFunctionCalls = false; |
| 525 | bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; |
| 526 | bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; |
| 527 | |
| 528 | AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; |
| 529 | |
| 530 | StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { |
| 531 | Attribute GPUAttr = F.getFnAttribute("target-cpu"); |
| 532 | return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); |
| 533 | } |
| 534 | |
| 535 | StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { |
| 536 | Attribute FSAttr = F.getFnAttribute("target-features"); |
| 537 | |
| 538 | return FSAttr.isValid() ? FSAttr.getValueAsString() |
| 539 | : getTargetFeatureString(); |
| 540 | } |
| 541 | |
| 542 | /// Predicate for Internalize pass. |
| 543 | static bool mustPreserveGV(const GlobalValue &GV) { |
| 544 | if (const Function *F = dyn_cast<Function>(&GV)) |
| 545 | return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv()); |
| 546 | |
| 547 | GV.removeDeadConstantUsers(); |
| 548 | return !GV.use_empty(); |
| 549 | } |
| 550 | |
| 551 | void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { |
| 552 | Builder.DivergentTarget = true; |
| 553 | |
| 554 | bool EnableOpt = getOptLevel() > CodeGenOpt::None; |
| 555 | bool Internalize = InternalizeSymbols; |
| 556 | bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; |
| 557 | bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; |
| 558 | bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; |
| 559 | |
| 560 | if (EnableFunctionCalls) { |
| 561 | delete Builder.Inliner; |
| 562 | Builder.Inliner = createFunctionInliningPass(); |
| 563 | } |
| 564 | |
| 565 | Builder.addExtension( |
| 566 | PassManagerBuilder::EP_ModuleOptimizerEarly, |
| 567 | [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, |
| 568 | legacy::PassManagerBase &PM) { |
| 569 | if (AMDGPUAA) { |
| 570 | PM.add(createAMDGPUAAWrapperPass()); |
| 571 | PM.add(createAMDGPUExternalAAWrapperPass()); |
| 572 | } |
| 573 | PM.add(createAMDGPUUnifyMetadataPass()); |
| 574 | PM.add(createAMDGPUPrintfRuntimeBinding()); |
| 575 | if (Internalize) |
| 576 | PM.add(createInternalizePass(mustPreserveGV)); |
| 577 | PM.add(createAMDGPUPropagateAttributesLatePass(this)); |
| 578 | if (Internalize) |
| 579 | PM.add(createGlobalDCEPass()); |
| 580 | if (EarlyInline) |
| 581 | PM.add(createAMDGPUAlwaysInlinePass(false)); |
| 582 | }); |
| 583 | |
| 584 | Builder.addExtension( |
| 585 | PassManagerBuilder::EP_EarlyAsPossible, |
| 586 | [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, |
| 587 | legacy::PassManagerBase &PM) { |
| 588 | if (AMDGPUAA) { |
| 589 | PM.add(createAMDGPUAAWrapperPass()); |
| 590 | PM.add(createAMDGPUExternalAAWrapperPass()); |
| 591 | } |
| 592 | PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); |
| 593 | PM.add(llvm::createAMDGPUUseNativeCallsPass()); |
| 594 | if (LibCallSimplify) |
| 595 | PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); |
| 596 | }); |
| 597 | |
| 598 | Builder.addExtension( |
| 599 | PassManagerBuilder::EP_CGSCCOptimizerLate, |
| 600 | [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) { |
| 601 | // Add infer address spaces pass to the opt pipeline after inlining |
| 602 | // but before SROA to increase SROA opportunities. |
| 603 | PM.add(createInferAddressSpacesPass()); |
| 604 | |
| 605 | // This should run after inlining to have any chance of doing anything, |
| 606 | // and before other cleanup optimizations. |
| 607 | PM.add(createAMDGPULowerKernelAttributesPass()); |
| 608 | |
| 609 | // Promote alloca to vector before SROA and loop unroll. If we manage |
| 610 | // to eliminate allocas before unroll we may choose to unroll less. |
| 611 | if (EnableOpt) |
| 612 | PM.add(createAMDGPUPromoteAllocaToVector()); |
| 613 | }); |
| 614 | } |
| 615 | |
| 616 | void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { |
| 617 | AAM.registerFunctionAnalysis<AMDGPUAA>(); |
| 618 | } |
| 619 | |
| 620 | void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { |
| 621 | PB.registerPipelineParsingCallback( |
| 622 | [this](StringRef PassName, ModulePassManager &PM, |
| 623 | ArrayRef<PassBuilder::PipelineElement>) { |
| 624 | if (PassName == "amdgpu-propagate-attributes-late") { |
| 625 | PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); |
| 626 | return true; |
| 627 | } |
| 628 | if (PassName == "amdgpu-unify-metadata") { |
| 629 | PM.addPass(AMDGPUUnifyMetadataPass()); |
| 630 | return true; |
| 631 | } |
| 632 | if (PassName == "amdgpu-printf-runtime-binding") { |
| 633 | PM.addPass(AMDGPUPrintfRuntimeBindingPass()); |
| 634 | return true; |
| 635 | } |
| 636 | if (PassName == "amdgpu-always-inline") { |
| 637 | PM.addPass(AMDGPUAlwaysInlinePass()); |
| 638 | return true; |
| 639 | } |
| 640 | if (PassName == "amdgpu-replace-lds-use-with-pointer") { |
| 641 | PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); |
| 642 | return true; |
| 643 | } |
| 644 | if (PassName == "amdgpu-lower-module-lds") { |
| 645 | PM.addPass(AMDGPULowerModuleLDSPass()); |
| 646 | return true; |
| 647 | } |
| 648 | return false; |
| 649 | }); |
| 650 | PB.registerPipelineParsingCallback( |
| 651 | [this](StringRef PassName, FunctionPassManager &PM, |
| 652 | ArrayRef<PassBuilder::PipelineElement>) { |
| 653 | if (PassName == "amdgpu-simplifylib") { |
| 654 | PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); |
| 655 | return true; |
| 656 | } |
| 657 | if (PassName == "amdgpu-usenative") { |
| 658 | PM.addPass(AMDGPUUseNativeCallsPass()); |
| 659 | return true; |
| 660 | } |
| 661 | if (PassName == "amdgpu-promote-alloca") { |
| 662 | PM.addPass(AMDGPUPromoteAllocaPass(*this)); |
| 663 | return true; |
| 664 | } |
| 665 | if (PassName == "amdgpu-promote-alloca-to-vector") { |
| 666 | PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); |
| 667 | return true; |
| 668 | } |
| 669 | if (PassName == "amdgpu-lower-kernel-attributes") { |
| 670 | PM.addPass(AMDGPULowerKernelAttributesPass()); |
| 671 | return true; |
| 672 | } |
| 673 | if (PassName == "amdgpu-propagate-attributes-early") { |
| 674 | PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); |
| 675 | return true; |
| 676 | } |
| 677 | return false; |
| 678 | }); |
| 679 | |
| 680 | PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { |
| 681 | FAM.registerPass([&] { return AMDGPUAA(); }); |
| 682 | }); |
| 683 | |
| 684 | PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { |
| 685 | if (AAName == "amdgpu-aa") { |
| 686 | AAM.registerFunctionAnalysis<AMDGPUAA>(); |
| 687 | return true; |
| 688 | } |
| 689 | return false; |
| 690 | }); |
| 691 | |
| 692 | PB.registerPipelineStartEPCallback( |
| 693 | [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) { |
| 694 | FunctionPassManager FPM; |
| 695 | FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); |
| 696 | FPM.addPass(AMDGPUUseNativeCallsPass()); |
| 697 | if (EnableLibCallSimplify && |
| 698 | Level != PassBuilder::OptimizationLevel::O0) |
| 699 | FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); |
| 700 | PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); |
| 701 | }); |
| 702 | |
| 703 | PB.registerPipelineEarlySimplificationEPCallback( |
| 704 | [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) { |
| 705 | if (Level == PassBuilder::OptimizationLevel::O0) |
| 706 | return; |
| 707 | |
| 708 | PM.addPass(AMDGPUUnifyMetadataPass()); |
| 709 | PM.addPass(AMDGPUPrintfRuntimeBindingPass()); |
| 710 | |
| 711 | if (InternalizeSymbols) { |
| 712 | PM.addPass(InternalizePass(mustPreserveGV)); |
| 713 | } |
| 714 | PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); |
| 715 | if (InternalizeSymbols) { |
| 716 | PM.addPass(GlobalDCEPass()); |
| 717 | } |
| 718 | if (EarlyInlineAll && !EnableFunctionCalls) |
| 719 | PM.addPass(AMDGPUAlwaysInlinePass()); |
| 720 | }); |
| 721 | |
| 722 | PB.registerCGSCCOptimizerLateEPCallback( |
| 723 | [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { |
| 724 | if (Level == PassBuilder::OptimizationLevel::O0) |
| 725 | return; |
| 726 | |
| 727 | FunctionPassManager FPM; |
| 728 | |
| 729 | // Add infer address spaces pass to the opt pipeline after inlining |
| 730 | // but before SROA to increase SROA opportunities. |
| 731 | FPM.addPass(InferAddressSpacesPass()); |
| 732 | |
| 733 | // This should run after inlining to have any chance of doing |
| 734 | // anything, and before other cleanup optimizations. |
| 735 | FPM.addPass(AMDGPULowerKernelAttributesPass()); |
| 736 | |
| 737 | if (Level != PassBuilder::OptimizationLevel::O0) { |
| 738 | // Promote alloca to vector before SROA and loop unroll. If we |
| 739 | // manage to eliminate allocas before unroll we may choose to unroll |
| 740 | // less. |
| 741 | FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); |
| 742 | } |
| 743 | |
| 744 | PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); |
| 745 | }); |
| 746 | } |
| 747 | |
| 748 | //===----------------------------------------------------------------------===// |
| 749 | // R600 Target Machine (R600 -> Cayman) |
| 750 | //===----------------------------------------------------------------------===// |
| 751 | |
| 752 | R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, |
| 753 | StringRef CPU, StringRef FS, |
| 754 | TargetOptions Options, |
| 755 | Optional<Reloc::Model> RM, |
| 756 | Optional<CodeModel::Model> CM, |
| 757 | CodeGenOpt::Level OL, bool JIT) |
| 758 | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { |
| 759 | setRequiresStructuredCFG(true); |
| 760 | |
| 761 | // Override the default since calls aren't supported for r600. |
| 762 | if (EnableFunctionCalls && |
| 763 | EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0) |
| 764 | EnableFunctionCalls = false; |
| 765 | } |
| 766 | |
| 767 | const R600Subtarget *R600TargetMachine::getSubtargetImpl( |
| 768 | const Function &F) const { |
| 769 | StringRef GPU = getGPUName(F); |
| 770 | StringRef FS = getFeatureString(F); |
| 771 | |
| 772 | SmallString<128> SubtargetKey(GPU); |
| 773 | SubtargetKey.append(FS); |
| 774 | |
| 775 | auto &I = SubtargetMap[SubtargetKey]; |
| 776 | if (!I) { |
| 777 | // This needs to be done before we create a new subtarget since any |
| 778 | // creation will depend on the TM and the code generation flags on the |
| 779 | // function that reside in TargetOptions. |
| 780 | resetTargetOptions(F); |
| 781 | I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); |
| 782 | } |
| 783 | |
| 784 | return I.get(); |
| 785 | } |
| 786 | |
| 787 | int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { |
| 788 | return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
| 789 | AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || |
| 790 | AddrSpace == AMDGPUAS::REGION_ADDRESS) |
| 791 | ? -1 |
| 792 | : 0; |
| 793 | } |
| 794 | |
| 795 | bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, |
| 796 | unsigned DestAS) const { |
| 797 | return AMDGPU::isFlatGlobalAddrSpace(SrcAS) && |
| 798 | AMDGPU::isFlatGlobalAddrSpace(DestAS); |
| 799 | } |
| 800 | |
| 801 | unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { |
| 802 | const auto *LD = dyn_cast<LoadInst>(V); |
| 803 | if (!LD) |
| 804 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
| 805 | |
| 806 | // It must be a generic pointer loaded. |
| 807 | assert(V->getType()->isPointerTy() && |
| 808 | V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); |
| 809 | |
| 810 | const auto *Ptr = LD->getPointerOperand(); |
| 811 | if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) |
| 812 | return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; |
| 813 | // For a generic pointer loaded from the constant memory, it could be assumed |
| 814 | // as a global pointer since the constant memory is only populated on the |
| 815 | // host side. As implied by the offload programming model, only global |
| 816 | // pointers could be referenced on the host side. |
| 817 | return AMDGPUAS::GLOBAL_ADDRESS; |
| 818 | } |
| 819 | |
| 820 | TargetTransformInfo |
| 821 | R600TargetMachine::getTargetTransformInfo(const Function &F) { |
| 822 | return TargetTransformInfo(R600TTIImpl(this, F)); |
| 823 | } |
| 824 | |
| 825 | //===----------------------------------------------------------------------===// |
| 826 | // GCN Target Machine (SI+) |
| 827 | //===----------------------------------------------------------------------===// |
| 828 | |
| 829 | GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, |
| 830 | StringRef CPU, StringRef FS, |
| 831 | TargetOptions Options, |
| 832 | Optional<Reloc::Model> RM, |
| 833 | Optional<CodeModel::Model> CM, |
| 834 | CodeGenOpt::Level OL, bool JIT) |
| 835 | : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} |
| 836 | |
| 837 | const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { |
| 838 | StringRef GPU = getGPUName(F); |
| 839 | StringRef FS = getFeatureString(F); |
| 840 | |
| 841 | SmallString<128> SubtargetKey(GPU); |
| 842 | SubtargetKey.append(FS); |
| 843 | |
| 844 | auto &I = SubtargetMap[SubtargetKey]; |
| 845 | if (!I) { |
| 846 | // This needs to be done before we create a new subtarget since any |
| 847 | // creation will depend on the TM and the code generation flags on the |
| 848 | // function that reside in TargetOptions. |
| 849 | resetTargetOptions(F); |
| 850 | I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); |
| 851 | } |
| 852 | |
| 853 | I->setScalarizeGlobalBehavior(ScalarizeGlobal); |
| 854 | |
| 855 | return I.get(); |
| 856 | } |
| 857 | |
| 858 | TargetTransformInfo |
| 859 | GCNTargetMachine::getTargetTransformInfo(const Function &F) { |
| 860 | return TargetTransformInfo(GCNTTIImpl(this, F)); |
| 861 | } |
| 862 | |
| 863 | //===----------------------------------------------------------------------===// |
| 864 | // AMDGPU Pass Setup |
| 865 | //===----------------------------------------------------------------------===// |
| 866 | |
| 867 | namespace { |
| 868 | |
| 869 | class AMDGPUPassConfig : public TargetPassConfig { |
| 870 | public: |
| 871 | AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
| 872 | : TargetPassConfig(TM, PM) { |
| 873 | // Exceptions and StackMaps are not supported, so these passes will never do |
| 874 | // anything. |
| 875 | disablePass(&StackMapLivenessID); |
| 876 | disablePass(&FuncletLayoutID); |
| 877 | // Garbage collection is not supported. |
| 878 | disablePass(&GCLoweringID); |
| 879 | disablePass(&ShadowStackGCLoweringID); |
| 880 | } |
| 881 | |
| 882 | AMDGPUTargetMachine &getAMDGPUTargetMachine() const { |
| 883 | return getTM<AMDGPUTargetMachine>(); |
| 884 | } |
| 885 | |
| 886 | ScheduleDAGInstrs * |
| 887 | createMachineScheduler(MachineSchedContext *C) const override { |
| 888 | ScheduleDAGMILive *DAG = createGenericSchedLive(C); |
| 889 | DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); |
| 890 | return DAG; |
| 891 | } |
| 892 | |
| 893 | void addEarlyCSEOrGVNPass(); |
| 894 | void addStraightLineScalarOptimizationPasses(); |
| 895 | void addIRPasses() override; |
| 896 | void addCodeGenPrepare() override; |
| 897 | bool addPreISel() override; |
| 898 | bool addInstSelector() override; |
| 899 | bool addGCPasses() override; |
| 900 | |
| 901 | std::unique_ptr<CSEConfigBase> getCSEConfig() const override; |
| 902 | |
| 903 | /// Check if a pass is enabled given the \p Opt option. The option always |
| 904 | /// overrides defaults if explicitly used. Otherwise its default will |
| 905 | /// be used given that a pass shall work at an optimization \p Level |
| 906 | /// minimum. |
| 907 | bool isPassEnabled(const cl::opt<bool> &Opt, |
| 908 | CodeGenOpt::Level Level = CodeGenOpt::Default) const { |
| 909 | if (Opt.getNumOccurrences()) |
| 910 | return Opt; |
| 911 | if (TM->getOptLevel() < Level) |
| 912 | return false; |
| 913 | return Opt; |
| 914 | } |
| 915 | }; |
| 916 | |
| 917 | std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const { |
| 918 | return getStandardCSEConfigForOpt(TM->getOptLevel()); |
| 919 | } |
| 920 | |
| 921 | class R600PassConfig final : public AMDGPUPassConfig { |
| 922 | public: |
| 923 | R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
| 924 | : AMDGPUPassConfig(TM, PM) {} |
| 925 | |
| 926 | ScheduleDAGInstrs *createMachineScheduler( |
| 927 | MachineSchedContext *C) const override { |
| 928 | return createR600MachineScheduler(C); |
| 929 | } |
| 930 | |
| 931 | bool addPreISel() override; |
| 932 | bool addInstSelector() override; |
| 933 | void addPreRegAlloc() override; |
| 934 | void addPreSched2() override; |
| 935 | void addPreEmitPass() override; |
| 936 | }; |
| 937 | |
| 938 | class GCNPassConfig final : public AMDGPUPassConfig { |
| 939 | public: |
| 940 | GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) |
| 941 | : AMDGPUPassConfig(TM, PM) { |
| 942 | // It is necessary to know the register usage of the entire call graph. We |
| 943 | // allow calls without EnableAMDGPUFunctionCalls if they are marked |
| 944 | // noinline, so this is always required. |
| 945 | setRequiresCodeGenSCCOrder(true); |
| 946 | } |
| 947 | |
| 948 | GCNTargetMachine &getGCNTargetMachine() const { |
| 949 | return getTM<GCNTargetMachine>(); |
| 950 | } |
| 951 | |
| 952 | ScheduleDAGInstrs * |
| 953 | createMachineScheduler(MachineSchedContext *C) const override; |
| 954 | |
| 955 | bool addPreISel() override; |
| 956 | void addMachineSSAOptimization() override; |
| 957 | bool addILPOpts() override; |
| 958 | bool addInstSelector() override; |
| 959 | bool addIRTranslator() override; |
| 960 | void addPreLegalizeMachineIR() override; |
| 961 | bool addLegalizeMachineIR() override; |
| 962 | void addPreRegBankSelect() override; |
| 963 | bool addRegBankSelect() override; |
| 964 | void addPreGlobalInstructionSelect() override; |
| 965 | bool addGlobalInstructionSelect() override; |
| 966 | void addFastRegAlloc() override; |
| 967 | void addOptimizedRegAlloc() override; |
| 968 | |
| 969 | FunctionPass *createSGPRAllocPass(bool Optimized); |
| 970 | FunctionPass *createVGPRAllocPass(bool Optimized); |
| 971 | FunctionPass *createRegAllocPass(bool Optimized) override; |
| 972 | |
| 973 | bool addRegAssignAndRewriteFast() override; |
| 974 | bool addRegAssignAndRewriteOptimized() override; |
| 975 | |
| 976 | void addPreRegAlloc() override; |
| 977 | bool addPreRewrite() override; |
| 978 | void addPostRegAlloc() override; |
| 979 | void addPreSched2() override; |
| 980 | void addPreEmitPass() override; |
| 981 | }; |
| 982 | |
| 983 | } // end anonymous namespace |
| 984 | |
| 985 | void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { |
| 986 | if (getOptLevel() == CodeGenOpt::Aggressive) |
| 987 | addPass(createGVNPass()); |
| 988 | else |
| 989 | addPass(createEarlyCSEPass()); |
| 990 | } |
| 991 | |
| 992 | void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { |
| 993 | addPass(createLICMPass()); |
| 994 | addPass(createSeparateConstOffsetFromGEPPass()); |
| 995 | addPass(createSpeculativeExecutionPass()); |
| 996 | // ReassociateGEPs exposes more opportunities for SLSR. See |
| 997 | // the example in reassociate-geps-and-slsr.ll. |
| 998 | addPass(createStraightLineStrengthReducePass()); |
| 999 | // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or |
| 1000 | // EarlyCSE can reuse. |
| 1001 | addEarlyCSEOrGVNPass(); |
| 1002 | // Run NaryReassociate after EarlyCSE/GVN to be more effective. |
| 1003 | addPass(createNaryReassociatePass()); |
| 1004 | // NaryReassociate on GEPs creates redundant common expressions, so run |
| 1005 | // EarlyCSE after it. |
| 1006 | addPass(createEarlyCSEPass()); |
| 1007 | } |
| 1008 | |
| 1009 | void AMDGPUPassConfig::addIRPasses() { |
| 1010 | const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); |
| 1011 | |
| 1012 | // There is no reason to run these. |
| 1013 | disablePass(&StackMapLivenessID); |
| 1014 | disablePass(&FuncletLayoutID); |
| 1015 | disablePass(&PatchableFunctionID); |
| 1016 | |
| 1017 | addPass(createAMDGPUPrintfRuntimeBinding()); |
| 1018 | |
| 1019 | // This must occur before inlining, as the inliner will not look through |
| 1020 | // bitcast calls. |
| 1021 | addPass(createAMDGPUFixFunctionBitcastsPass()); |
| 1022 | |
| 1023 | // Run the propagate attributes pass in the backend in case opt was not run. |
| 1024 | addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); |
| 1025 | |
| 1026 | addPass(createAMDGPULowerIntrinsicsPass()); |
| 1027 | |
| 1028 | // Function calls are not supported, so make sure we inline everything. |
| 1029 | addPass(createAMDGPUAlwaysInlinePass()); |
| 1030 | addPass(createAlwaysInlinerLegacyPass()); |
| 1031 | // We need to add the barrier noop pass, otherwise adding the function |
| 1032 | // inlining pass will cause all of the PassConfigs passes to be run |
| 1033 | // one function at a time, which means if we have a module with two |
| 1034 | // functions, then we will generate code for the first function |
| 1035 | // without ever running any passes on the second. |
| 1036 | addPass(createBarrierNoopPass()); |
| 1037 | |
| 1038 | // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. |
| 1039 | if (TM.getTargetTriple().getArch() == Triple::r600) |
| 1040 | addPass(createR600OpenCLImageTypeLoweringPass()); |
| 1041 | |
| 1042 | // Replace OpenCL enqueued block function pointers with global variables. |
| 1043 | addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); |
| 1044 | |
| 1045 | // Can increase the LDS used by a kernel, so this runs before PromoteAlloca. |
| 1046 | if (EnableLowerModuleLDS) { |
| 1047 | // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the |
| 1048 | // pass "amdgpu-lower-module-lds", and also it required to be run only if |
| 1049 | // "amdgpu-lower-module-lds" pass is enabled. |
| 1050 | if (EnableLDSReplaceWithPointer) |
| 1051 | addPass(createAMDGPUReplaceLDSUseWithPointerPass()); |
| 1052 | |
| 1053 | addPass(createAMDGPULowerModuleLDSPass()); |
| 1054 | } |
| 1055 | |
| 1056 | if (TM.getOptLevel() > CodeGenOpt::None) |
| 1057 | addPass(createInferAddressSpacesPass()); |
| 1058 | |
| 1059 | addPass(createAtomicExpandPass()); |
| 1060 | |
| 1061 | if (TM.getOptLevel() > CodeGenOpt::None) { |
| 1062 | addPass(createAMDGPUPromoteAlloca()); |
| 1063 | |
| 1064 | if (EnableSROA) |
| 1065 | addPass(createSROAPass()); |
| 1066 | if (isPassEnabled(EnableScalarIRPasses)) |
| 1067 | addStraightLineScalarOptimizationPasses(); |
| 1068 | |
| 1069 | if (EnableAMDGPUAliasAnalysis) { |
| 1070 | addPass(createAMDGPUAAWrapperPass()); |
| 1071 | addPass(createExternalAAWrapperPass([](Pass &P, Function &, |
| 1072 | AAResults &AAR) { |
| 1073 | if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) |
| 1074 | AAR.addAAResult(WrapperPass->getResult()); |
| 1075 | })); |
| 1076 | } |
| 1077 | |
| 1078 | if (TM.getTargetTriple().getArch() == Triple::amdgcn) { |
| 1079 | // TODO: May want to move later or split into an early and late one. |
| 1080 | addPass(createAMDGPUCodeGenPreparePass()); |
| 1081 | } |
| 1082 | } |
| 1083 | |
| 1084 | TargetPassConfig::addIRPasses(); |
| 1085 | |
| 1086 | // EarlyCSE is not always strong enough to clean up what LSR produces. For |
| 1087 | // example, GVN can combine |
| 1088 | // |
| 1089 | // %0 = add %a, %b |
| 1090 | // %1 = add %b, %a |
| 1091 | // |
| 1092 | // and |
| 1093 | // |
| 1094 | // %0 = shl nsw %a, 2 |
| 1095 | // %1 = shl %a, 2 |
| 1096 | // |
| 1097 | // but EarlyCSE can do neither of them. |
| 1098 | if (isPassEnabled(EnableScalarIRPasses)) |
| 1099 | addEarlyCSEOrGVNPass(); |
| 1100 | } |
| 1101 | |
| 1102 | void AMDGPUPassConfig::addCodeGenPrepare() { |
| 1103 | if (TM->getTargetTriple().getArch() == Triple::amdgcn) |
| 1104 | addPass(createAMDGPUAnnotateKernelFeaturesPass()); |
| 1105 | |
| 1106 | if (TM->getTargetTriple().getArch() == Triple::amdgcn && |
| 1107 | EnableLowerKernelArguments) |
| 1108 | addPass(createAMDGPULowerKernelArgumentsPass()); |
| 1109 | |
| 1110 | TargetPassConfig::addCodeGenPrepare(); |
| 1111 | |
| 1112 | if (isPassEnabled(EnableLoadStoreVectorizer)) |
| 1113 | addPass(createLoadStoreVectorizerPass()); |
| 1114 | |
| 1115 | // LowerSwitch pass may introduce unreachable blocks that can |
| 1116 | // cause unexpected behavior for subsequent passes. Placing it |
| 1117 | // here seems better, as these blocks will get cleaned up by |
| 1118 | // UnreachableBlockElim inserted next in the pass flow. |
| 1119 | addPass(createLowerSwitchPass()); |
| 1120 | } |
| 1121 | |
| 1122 | bool AMDGPUPassConfig::addPreISel() { |
| 1123 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1124 | addPass(createFlattenCFGPass()); |
| 1125 | return false; |
| 1126 | } |
| 1127 | |
| 1128 | bool AMDGPUPassConfig::addInstSelector() { |
| 1129 | // Defer the verifier until FinalizeISel. |
| 1130 | addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false); |
| 1131 | return false; |
| 1132 | } |
| 1133 | |
| 1134 | bool AMDGPUPassConfig::addGCPasses() { |
| 1135 | // Do nothing. GC is not supported. |
| 1136 | return false; |
| 1137 | } |
| 1138 | |
| 1139 | //===----------------------------------------------------------------------===// |
| 1140 | // R600 Pass Setup |
| 1141 | //===----------------------------------------------------------------------===// |
| 1142 | |
| 1143 | bool R600PassConfig::addPreISel() { |
| 1144 | AMDGPUPassConfig::addPreISel(); |
| 1145 | |
| 1146 | if (EnableR600StructurizeCFG) |
| 1147 | addPass(createStructurizeCFGPass()); |
| 1148 | return false; |
| 1149 | } |
| 1150 | |
| 1151 | bool R600PassConfig::addInstSelector() { |
| 1152 | addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel())); |
| 1153 | return false; |
| 1154 | } |
| 1155 | |
| 1156 | void R600PassConfig::addPreRegAlloc() { |
| 1157 | addPass(createR600VectorRegMerger()); |
| 1158 | } |
| 1159 | |
| 1160 | void R600PassConfig::addPreSched2() { |
| 1161 | addPass(createR600EmitClauseMarkers(), false); |
| 1162 | if (EnableR600IfConvert) |
| 1163 | addPass(&IfConverterID, false); |
| 1164 | addPass(createR600ClauseMergePass(), false); |
| 1165 | } |
| 1166 | |
| 1167 | void R600PassConfig::addPreEmitPass() { |
| 1168 | addPass(createAMDGPUCFGStructurizerPass(), false); |
| 1169 | addPass(createR600ExpandSpecialInstrsPass(), false); |
| 1170 | addPass(&FinalizeMachineBundlesID, false); |
| 1171 | addPass(createR600Packetizer(), false); |
| 1172 | addPass(createR600ControlFlowFinalizer(), false); |
| 1173 | } |
| 1174 | |
| 1175 | TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { |
| 1176 | return new R600PassConfig(*this, PM); |
| 1177 | } |
| 1178 | |
| 1179 | //===----------------------------------------------------------------------===// |
| 1180 | // GCN Pass Setup |
| 1181 | //===----------------------------------------------------------------------===// |
| 1182 | |
| 1183 | ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( |
| 1184 | MachineSchedContext *C) const { |
| 1185 | const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); |
| 1186 | if (ST.enableSIScheduler()) |
| 1187 | return createSIMachineScheduler(C); |
| 1188 | return createGCNMaxOccupancyMachineScheduler(C); |
| 1189 | } |
| 1190 | |
| 1191 | bool GCNPassConfig::addPreISel() { |
| 1192 | AMDGPUPassConfig::addPreISel(); |
| 1193 | |
| 1194 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1195 | addPass(createAMDGPULateCodeGenPreparePass()); |
| 1196 | |
| 1197 | if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { |
| 1198 | addPass(createAMDGPUAtomicOptimizerPass()); |
| 1199 | } |
| 1200 | |
| 1201 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1202 | addPass(createSinkingPass()); |
| 1203 | |
| 1204 | // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit |
| 1205 | // regions formed by them. |
| 1206 | addPass(&AMDGPUUnifyDivergentExitNodesID); |
| 1207 | if (!LateCFGStructurize) { |
| 1208 | if (EnableStructurizerWorkarounds) { |
| 1209 | addPass(createFixIrreduciblePass()); |
| 1210 | addPass(createUnifyLoopExitsPass()); |
| 1211 | } |
| 1212 | addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions |
| 1213 | } |
| 1214 | addPass(createAMDGPUAnnotateUniformValues()); |
| 1215 | if (!LateCFGStructurize) { |
| 1216 | addPass(createSIAnnotateControlFlowPass()); |
| 1217 | } |
| 1218 | addPass(createLCSSAPass()); |
| 1219 | |
| 1220 | if (TM->getOptLevel() > CodeGenOpt::Less) |
| 1221 | addPass(&AMDGPUPerfHintAnalysisID); |
| 1222 | |
| 1223 | return false; |
| 1224 | } |
| 1225 | |
| 1226 | void GCNPassConfig::addMachineSSAOptimization() { |
| 1227 | TargetPassConfig::addMachineSSAOptimization(); |
| 1228 | |
| 1229 | // We want to fold operands after PeepholeOptimizer has run (or as part of |
| 1230 | // it), because it will eliminate extra copies making it easier to fold the |
| 1231 | // real source operand. We want to eliminate dead instructions after, so that |
| 1232 | // we see fewer uses of the copies. We then need to clean up the dead |
| 1233 | // instructions leftover after the operands are folded as well. |
| 1234 | // |
| 1235 | // XXX - Can we get away without running DeadMachineInstructionElim again? |
| 1236 | addPass(&SIFoldOperandsID); |
| 1237 | if (EnableDPPCombine) |
| 1238 | addPass(&GCNDPPCombineID); |
| 1239 | addPass(&SILoadStoreOptimizerID); |
| 1240 | if (isPassEnabled(EnableSDWAPeephole)) { |
| 1241 | addPass(&SIPeepholeSDWAID); |
| 1242 | addPass(&EarlyMachineLICMID); |
| 1243 | addPass(&MachineCSEID); |
| 1244 | addPass(&SIFoldOperandsID); |
| 1245 | } |
| 1246 | addPass(&DeadMachineInstructionElimID); |
| 1247 | addPass(createSIShrinkInstructionsPass()); |
| 1248 | } |
| 1249 | |
| 1250 | bool GCNPassConfig::addILPOpts() { |
| 1251 | if (EnableEarlyIfConversion) |
| 1252 | addPass(&EarlyIfConverterID); |
| 1253 | |
| 1254 | TargetPassConfig::addILPOpts(); |
| 1255 | return false; |
| 1256 | } |
| 1257 | |
| 1258 | bool GCNPassConfig::addInstSelector() { |
| 1259 | AMDGPUPassConfig::addInstSelector(); |
| 1260 | addPass(&SIFixSGPRCopiesID); |
| 1261 | addPass(createSILowerI1CopiesPass()); |
| 1262 | return false; |
| 1263 | } |
| 1264 | |
| 1265 | bool GCNPassConfig::addIRTranslator() { |
| 1266 | addPass(new IRTranslator(getOptLevel())); |
| 1267 | return false; |
| 1268 | } |
| 1269 | |
| 1270 | void GCNPassConfig::addPreLegalizeMachineIR() { |
| 1271 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1272 | addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); |
| 1273 | addPass(new Localizer()); |
| 1274 | } |
| 1275 | |
| 1276 | bool GCNPassConfig::addLegalizeMachineIR() { |
| 1277 | addPass(new Legalizer()); |
| 1278 | return false; |
| 1279 | } |
| 1280 | |
| 1281 | void GCNPassConfig::addPreRegBankSelect() { |
| 1282 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1283 | addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); |
| 1284 | } |
| 1285 | |
| 1286 | bool GCNPassConfig::addRegBankSelect() { |
| 1287 | addPass(new RegBankSelect()); |
| 1288 | return false; |
| 1289 | } |
| 1290 | |
| 1291 | void GCNPassConfig::addPreGlobalInstructionSelect() { |
| 1292 | bool IsOptNone = getOptLevel() == CodeGenOpt::None; |
| 1293 | addPass(createAMDGPURegBankCombiner(IsOptNone)); |
| 1294 | } |
| 1295 | |
| 1296 | bool GCNPassConfig::addGlobalInstructionSelect() { |
| 1297 | addPass(new InstructionSelect(getOptLevel())); |
| 1298 | return false; |
| 1299 | } |
| 1300 | |
| 1301 | void GCNPassConfig::addPreRegAlloc() { |
| 1302 | if (LateCFGStructurize) { |
| 1303 | addPass(createAMDGPUMachineCFGStructurizerPass()); |
| 1304 | } |
| 1305 | } |
| 1306 | |
| 1307 | void GCNPassConfig::addFastRegAlloc() { |
| 1308 | // FIXME: We have to disable the verifier here because of PHIElimination + |
| 1309 | // TwoAddressInstructions disabling it. |
| 1310 | |
| 1311 | // This must be run immediately after phi elimination and before |
| 1312 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
| 1313 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
| 1314 | insertPass(&PHIEliminationID, &SILowerControlFlowID, false); |
| 1315 | |
| 1316 | insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); |
| 1317 | insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); |
| 1318 | |
| 1319 | TargetPassConfig::addFastRegAlloc(); |
| 1320 | } |
| 1321 | |
| 1322 | void GCNPassConfig::addOptimizedRegAlloc() { |
| 1323 | // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation |
| 1324 | // instructions that cause scheduling barriers. |
| 1325 | insertPass(&MachineSchedulerID, &SIWholeQuadModeID); |
| 1326 | insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); |
| 1327 | |
| 1328 | if (OptExecMaskPreRA) |
| 1329 | insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); |
| 1330 | |
| 1331 | if (isPassEnabled(EnablePreRAOptimizations)) |
| 1332 | insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); |
| 1333 | |
| 1334 | // This is not an essential optimization and it has a noticeable impact on |
| 1335 | // compilation time, so we only enable it from O2. |
| 1336 | if (TM->getOptLevel() > CodeGenOpt::Less) |
| 1337 | insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); |
| 1338 | |
| 1339 | // FIXME: when an instruction has a Killed operand, and the instruction is |
| 1340 | // inside a bundle, it seems only the BUNDLE instruction appears as the kill of |
| 1341 | // the register in LiveVariables; this would trigger a failure in the verifier, |
| 1342 | // so we should fix it and enable the verifier. |
| 1343 | if (OptVGPRLiveRange) |
| 1344 | insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false); |
| 1345 | // This must be run immediately after phi elimination and before |
| 1346 | // TwoAddressInstructions, otherwise the processing of the tied operand of |
| 1347 | // SI_ELSE will introduce a copy of the tied operand source after the else. |
| 1348 | insertPass(&PHIEliminationID, &SILowerControlFlowID, false); |
| 1349 | |
| 1350 | if (EnableDCEInRA) |
| 1351 | insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); |
| 1352 | |
| 1353 | TargetPassConfig::addOptimizedRegAlloc(); |
| 1354 | } |
| 1355 | |
| 1356 | bool GCNPassConfig::addPreRewrite() { |
| 1357 | if (EnableRegReassign) |
| 1358 | addPass(&GCNNSAReassignID); |
| 1359 | return true; |
| 1360 | } |
| 1361 | |
| 1362 | FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { |
| 1363 | // Initialize the global default. |
| 1364 | llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, |
| 1365 | initializeDefaultSGPRRegisterAllocatorOnce); |
| 1366 | |
| 1367 | RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); |
| 1368 | if (Ctor != useDefaultRegisterAllocator) |
| 1369 | return Ctor(); |
| 1370 | |
| 1371 | if (Optimized) |
| 1372 | return createGreedyRegisterAllocator(onlyAllocateSGPRs); |
| 1373 | |
| 1374 | return createFastRegisterAllocator(onlyAllocateSGPRs, false); |
| 1375 | } |
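|      | // Illustrative sketch only: getDefault() above returns a ctor other than
|      | // useDefaultRegisterAllocator when an alternative SGPR allocator has been
|      | // registered through the SGPRRegisterRegAlloc registry, along the lines of
|      | //
|      | //   static SGPRRegisterRegAlloc
|      | //       myAlloc("my-sgpr", "hypothetical SGPR allocator",
|      | //               createMySGPRAllocPass);
|      | //
|      | // "my-sgpr" and createMySGPRAllocPass are invented for illustration and do
|      | // not exist in this file.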
| 1376 | |
| 1377 | FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { |
| 1378 | // Initialize the global default. |
| 1379 | llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, |
| 1380 | initializeDefaultVGPRRegisterAllocatorOnce); |
| 1381 | |
| 1382 | RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); |
| 1383 | if (Ctor != useDefaultRegisterAllocator) |
| 1384 | return Ctor(); |
| 1385 | |
| 1386 | if (Optimized) |
| 1387 | return createGreedyVGPRRegisterAllocator(); |
| 1388 | |
| 1389 | return createFastVGPRRegisterAllocator(); |
| 1390 | } |
| 1391 | |
| 1392 | FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { |
| 1393 | llvm_unreachable("should not be used")__builtin_unreachable(); |
| 1394 | } |
| 1395 | |
| 1396 | static const char RegAllocOptNotSupportedMessage[] = |
| 1397 | "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; |
| 1398 | |
| 1399 | bool GCNPassConfig::addRegAssignAndRewriteFast() { |
| 1400 | if (!usingDefaultRegAlloc()) |
| 1401 | report_fatal_error(RegAllocOptNotSupportedMessage); |
| 1402 | |
| 1403 | addPass(createSGPRAllocPass(false)); |
| 1404 | |
| 1405 | // Equivalent of PEI for SGPRs. |
| 1406 | addPass(&SILowerSGPRSpillsID); |
| 1407 | |
| 1408 | addPass(createVGPRAllocPass(false)); |
| 1409 | return true; |
| 1410 | } |
| 1411 | |
| 1412 | bool GCNPassConfig::addRegAssignAndRewriteOptimized() { |
| 1413 | if (!usingDefaultRegAlloc()) |
| 1414 | report_fatal_error(RegAllocOptNotSupportedMessage); |
| 1415 | |
| 1416 | addPass(createSGPRAllocPass(true)); |
| 1417 | |
| 1418 | // Commit allocated register changes. This is mostly necessary because too |
| 1419 | // many things rely on the use lists of the physical registers, such as the |
| 1420 | // verifier. This is only necessary with allocators which use LiveIntervals, |
| 1421 | // since FastRegAlloc does the replacements itself.
| 1422 | addPass(createVirtRegRewriter(false)); |
| 1423 | |
| 1424 | // Equivalent of PEI for SGPRs. |
| 1425 | addPass(&SILowerSGPRSpillsID); |
| 1426 | |
| 1427 | addPass(createVGPRAllocPass(true)); |
| 1428 | |
| 1429 | addPreRewrite(); |
| 1430 | addPass(&VirtRegRewriterID); |
| 1431 | |
| 1432 | return true; |
| 1433 | } |
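|      | // Hedged usage note: the fast and optimized paths above are picked based on
|      | // the codegen optimization level, and the per-class allocators can be chosen
|      | // with the -sgpr-regalloc and -vgpr-regalloc options named in
|      | // RegAllocOptNotSupportedMessage, e.g. (allocator names assumed, not taken
|      | // from this file):
|      | //
|      | //   llc -march=amdgcn -sgpr-regalloc=fast -vgpr-regalloc=greedy foo.ll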
| 1434 | |
| 1435 | void GCNPassConfig::addPostRegAlloc() { |
| 1436 | addPass(&SIFixVGPRCopiesID); |
| 1437 | if (getOptLevel() > CodeGenOpt::None) |
| 1438 | addPass(&SIOptimizeExecMaskingID); |
| 1439 | TargetPassConfig::addPostRegAlloc(); |
| 1440 | } |
| 1441 | |
| 1442 | void GCNPassConfig::addPreSched2() { |
| 1443 | addPass(&SIPostRABundlerID); |
| 1444 | } |
| 1445 | |
| 1446 | void GCNPassConfig::addPreEmitPass() { |
| 1447 | addPass(createSIMemoryLegalizerPass()); |
| 1448 | addPass(createSIInsertWaitcntsPass()); |
| 1449 | |
| 1450 | if (TM->getOptLevel() > CodeGenOpt::None) |
| 1451 | addPass(createSIShrinkInstructionsPass()); |
| 1452 | |
| 1453 | addPass(createSIModeRegisterPass()); |
| 1454 | |
| 1455 | if (getOptLevel() > CodeGenOpt::None) |
| 1456 | addPass(&SIInsertHardClausesID); |
| 1457 | |
| 1458 | addPass(&SILateBranchLoweringPassID); |
| 1459 | if (getOptLevel() > CodeGenOpt::None) |
| 1460 | addPass(&SIPreEmitPeepholeID); |
| 1461 | // The hazard recognizer that runs as part of the post-RA scheduler is not
| 1462 | // guaranteed to handle all hazards correctly. This is because, if there
| 1463 | // are multiple scheduling regions in a basic block, the regions are scheduled |
| 1464 | // bottom up, so when we begin to schedule a region we don't know what |
| 1465 | // instructions were emitted directly before it. |
| 1466 | // |
| 1467 | // Here we add a stand-alone hazard recognizer pass which can handle all |
| 1468 | // cases. |
| 1469 | addPass(&PostRAHazardRecognizerID); |
| 1470 | addPass(&BranchRelaxationPassID); |
| 1471 | } |
| 1472 | |
| 1473 | TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { |
| 1474 | return new GCNPassConfig(*this, PM); |
| 1475 | } |
| 1476 | |
| 1477 | yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { |
| 1478 | return new yaml::SIMachineFunctionInfo(); |
| 1479 | } |
| 1480 | |
| 1481 | yaml::MachineFunctionInfo * |
| 1482 | GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { |
| 1483 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| 1484 | return new yaml::SIMachineFunctionInfo( |
| 1485 | *MFI, *MF.getSubtarget().getRegisterInfo(), MF); |
| 1486 | } |
| 1487 | |
| 1488 | bool GCNTargetMachine::parseMachineFunctionInfo( |
| 1489 | const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, |
| 1490 | SMDiagnostic &Error, SMRange &SourceRange) const { |
| 1491 | const yaml::SIMachineFunctionInfo &YamlMFI = |
| 1492 | reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_); |
| 1493 | MachineFunction &MF = PFS.MF; |
| 1494 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| 1495 | |
| 1496 | if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) |
| 1497 | return true; |
| 1498 | |
| 1499 | if (MFI->Occupancy == 0) { |
| 1500 | // Fix up the subtarget-dependent default value.
| 1501 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| 1502 | MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); |
| 1503 | } |
| 1504 | |
| 1505 | auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { |
| 1506 | Register TempReg; |
| 1507 | if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { |
| 1508 | SourceRange = RegName.SourceRange; |
| 1509 | return true; |
| 1510 | } |
| 1511 | RegVal = TempReg; |
| 1512 | |
| 1513 | return false; |
| 1514 | }; |
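|      | // For illustration (a sketch, not normative MIR): these named registers come
|      | // from the function's machineFunctionInfo YAML block, roughly of the form
|      | //
|      | //   machineFunctionInfo:
|      | //     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
|      | //     frameOffsetReg:    '$sgpr33'
|      | //     stackPtrOffsetReg: '$sgpr32'
|      | //
|      | // The key spellings are assumed from the YamlMFI field names used below.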
| 1515 | |
| 1516 | auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { |
| 1517 | // Create a diagnostic for the register string literal.
| 1518 | const MemoryBuffer &Buffer = |
| 1519 | *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); |
| 1520 | Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, |
| 1521 | RegName.Value.size(), SourceMgr::DK_Error, |
| 1522 | "incorrect register class for field", RegName.Value, |
| 1523 | None, None); |
| 1524 | SourceRange = RegName.SourceRange; |
| 1525 | return true; |
| 1526 | }; |
| 1527 | |
| 1528 | if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || |
| 1529 | parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || |
| 1530 | parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) |
| 1531 | return true; |
| 1532 | |
| 1533 | if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && |
| 1534 | !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { |
| 1535 | return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); |
| 1536 | } |
| 1537 | |
| 1538 | if (MFI->FrameOffsetReg != AMDGPU::FP_REG && |
| 1539 | !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { |
| 1540 | return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); |
| 1541 | } |
| 1542 | |
| 1543 | if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && |
| 1544 | !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { |
| 1545 | return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); |
| 1546 | } |
| 1547 | |
| 1548 | auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, |
| 1549 | const TargetRegisterClass &RC, |
| 1550 | ArgDescriptor &Arg, unsigned UserSGPRs, |
| 1551 | unsigned SystemSGPRs) { |
| 1552 | // Skip parsing if it's not present. |
| 1553 | if (!A) |
| 1554 | return false; |
| 1555 | |
| 1556 | if (A->IsRegister) { |
| 1557 | Register Reg; |
| 1558 | if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { |
| 1559 | SourceRange = A->RegisterName.SourceRange; |
| 1560 | return true; |
| 1561 | } |
| 1562 | if (!RC.contains(Reg)) |
| 1563 | return diagnoseRegisterClass(A->RegisterName); |
| 1564 | Arg = ArgDescriptor::createRegister(Reg); |
| 1565 | } else |
| 1566 | Arg = ArgDescriptor::createStack(A->StackOffset); |
| 1567 | // Check and apply the optional mask. |
| 1568 | if (A->Mask) |
| 1569 | Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); |
| 1570 | |
| 1571 | MFI->NumUserSGPRs += UserSGPRs; |
| 1572 | MFI->NumSystemSGPRs += SystemSGPRs; |
| 1573 | return false; |
| 1574 | }; |
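|      | // A hedged sketch of the corresponding MIR input: each optional argument is
|      | // either a register or a stack offset, with an optional mask, e.g.
|      | //
|      | //   argumentInfo:
|      | //     kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
|      | //     workItemIDX:       { reg: '$vgpr0' }
|      | //
|      | // The YAML key spellings are assumed from the ArgInfo field names checked
|      | // below, not quoted from this file.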
| 1575 | |
| 1576 | if (YamlMFI.ArgInfo && |
| 1577 | (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, |
| 1578 | AMDGPU::SGPR_128RegClass, |
| 1579 | MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || |
| 1580 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, |
| 1581 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, |
| 1582 | 2, 0) || |
| 1583 | parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, |
| 1584 | MFI->ArgInfo.QueuePtr, 2, 0) || |
| 1585 | parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, |
| 1586 | AMDGPU::SReg_64RegClass, |
| 1587 | MFI->ArgInfo.KernargSegmentPtr, 2, 0) || |
| 1588 | parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, |
| 1589 | AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, |
| 1590 | 2, 0) || |
| 1591 | parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, |
| 1592 | AMDGPU::SReg_64RegClass, |
| 1593 | MFI->ArgInfo.FlatScratchInit, 2, 0) || |
| 1594 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, |
| 1595 | AMDGPU::SGPR_32RegClass, |
| 1596 | MFI->ArgInfo.PrivateSegmentSize, 0, 0) || |
| 1597 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, |
| 1598 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, |
| 1599 | 0, 1) || |
| 1600 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, |
| 1601 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, |
| 1602 | 0, 1) || |
| 1603 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, |
| 1604 | AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, |
| 1605 | 0, 1) || |
| 1606 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, |
| 1607 | AMDGPU::SGPR_32RegClass, |
| 1608 | MFI->ArgInfo.WorkGroupInfo, 0, 1) || |
| 1609 | parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, |
| 1610 | AMDGPU::SGPR_32RegClass, |
| 1611 | MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || |
| 1612 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, |
| 1613 | AMDGPU::SReg_64RegClass, |
| 1614 | MFI->ArgInfo.ImplicitArgPtr, 0, 0) || |
| 1615 | parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, |
| 1616 | AMDGPU::SReg_64RegClass, |
| 1617 | MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || |
| 1618 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, |
| 1619 | AMDGPU::VGPR_32RegClass, |
| 1620 | MFI->ArgInfo.WorkItemIDX, 0, 0) || |
| 1621 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, |
| 1622 | AMDGPU::VGPR_32RegClass, |
| 1623 | MFI->ArgInfo.WorkItemIDY, 0, 0) || |
| 1624 | parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, |
| 1625 | AMDGPU::VGPR_32RegClass, |
| 1626 | MFI->ArgInfo.WorkItemIDZ, 0, 0))) |
| 1627 | return true; |
| 1628 | |
| 1629 | MFI->Mode.IEEE = YamlMFI.Mode.IEEE; |
| 1630 | MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; |
| 1631 | MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; |
| 1632 | MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; |
| 1633 | MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; |
| 1634 | MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; |
| 1635 | |
| 1636 | return false; |
| 1637 | } |