Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/BasicTTIImpl.h
Warning: line 1065, column 11
Called C++ object pointer is null
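
The checker behind this diagnostic, core.CallAndMessage, reports a C++ member-function call whose object pointer is null on at least one feasible path. The reduced example below is a hypothetical illustration of this warning class, not the LLVM code at BasicTTIImpl.h:1065; running clang --analyze with the default core checkers on a file like this produces the same warning text and a numbered path annotation like the one rendered for GCNTTIImpl::getMinMaxReductionCost further down.

// Hypothetical reduced example (not the flagged LLVM code): on the path
// where HasCost is false, Ptr remains null, and the analyzer reports
// "Called C++ object pointer is null" on the call below.
struct Cost {
  int value() const { return 42; }
};

int query(const Cost &C, bool HasCost) {
  const Cost *Ptr = nullptr;
  if (HasCost)
    Ptr = &C;
  return Ptr->value(); // warning: Called C++ object pointer is null
}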

Annotated Source Code

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUTargetTransformInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -stack-protector 2 -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
19#include "llvm/Analysis/LoopInfo.h"
20#include "llvm/Analysis/ValueTracking.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
22#include "llvm/IR/IRBuilder.h"
23#include "llvm/IR/PatternMatch.h"
24#include "llvm/Support/KnownBits.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "AMDGPUtti"
29
30static cl::opt<unsigned> UnrollThresholdPrivate(
31 "amdgpu-unroll-threshold-private",
32 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
33 cl::init(2700), cl::Hidden);
34
35static cl::opt<unsigned> UnrollThresholdLocal(
36 "amdgpu-unroll-threshold-local",
37 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
38 cl::init(1000), cl::Hidden);
39
40static cl::opt<unsigned> UnrollThresholdIf(
41 "amdgpu-unroll-threshold-if",
42 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
43 cl::init(200), cl::Hidden);
44
45static cl::opt<bool> UnrollRuntimeLocal(
46 "amdgpu-unroll-runtime-local",
47 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
48 cl::init(true), cl::Hidden);
49
50static cl::opt<bool> UseLegacyDA(
51 "amdgpu-use-legacy-divergence-analysis",
52 cl::desc("Enable legacy divergence analysis for AMDGPU"),
53 cl::init(false), cl::Hidden);
54
55static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58 cl::init(32), cl::Hidden);
59
60static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61 cl::Hidden, cl::init(4000),
62 cl::desc("Cost of alloca argument"));
63
64// If the amount of scratch memory to eliminate exceeds our ability to allocate
65// it into registers we gain nothing by aggressively inlining functions for that
66// heuristic.
67static cl::opt<unsigned>
68 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69 cl::init(256),
70 cl::desc("Maximum alloca size to use for inline cost"));
71
72// Inliner constraint to achieve reasonable compilation time.
73static cl::opt<size_t> InlineMaxBB(
74 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75 cl::desc("Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
77
78static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79 unsigned Depth = 0) {
80 const Instruction *I = dyn_cast<Instruction>(Cond);
81 if (!I)
82 return false;
83
84 for (const Value *V : I->operand_values()) {
85 if (!L->contains(I))
86 continue;
87 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89 return SubLoop->contains(PHI); }))
90 return true;
91 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92 return true;
93 }
94 return false;
95}
96
97AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
98 : BaseT(TM, F.getParent()->getDataLayout()),
99 TargetTriple(TM->getTargetTriple()),
100 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101 TLI(ST->getTargetLowering()) {}
102
103void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104 TTI::UnrollingPreferences &UP) {
105 const Function &F = *L->getHeader()->getParent();
106 UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
107 UP.MaxCount = std::numeric_limits<unsigned>::max();
108 UP.Partial = true;
109
110 // Conditional branch in a loop back edge needs 3 additional exec
111 // manipulations in average.
112 UP.BEInsns += 3;
113
114 // TODO: Do we want runtime unrolling?
115
116 // Maximum alloca size that can fit in registers. Reserve 16 registers.
117 const unsigned MaxAlloca = (256 - 16) * 4;
118 unsigned ThresholdPrivate = UnrollThresholdPrivate;
119 unsigned ThresholdLocal = UnrollThresholdLocal;
120
121 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
122 // provided threshold value as the default for Threshold
123 if (MDNode *LoopUnrollThreshold =
124 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
125 if (LoopUnrollThreshold->getNumOperands() == 2) {
126 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
127 LoopUnrollThreshold->getOperand(1));
128 if (MetaThresholdValue) {
129 // We will also use the supplied value for PartialThreshold for now.
130 // We may introduce additional metadata if it becomes necessary in the
131 // future.
132 UP.Threshold = MetaThresholdValue->getSExtValue();
133 UP.PartialThreshold = UP.Threshold;
134 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
135 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
136 }
137 }
138 }
139
140 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
141 for (const BasicBlock *BB : L->getBlocks()) {
142 const DataLayout &DL = BB->getModule()->getDataLayout();
143 unsigned LocalGEPsSeen = 0;
144
145 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
146 return SubLoop->contains(BB); }))
147 continue; // Block belongs to an inner loop.
148
149 for (const Instruction &I : *BB) {
150 // Unroll a loop which contains an "if" statement whose condition
151 // defined by a PHI belonging to the loop. This may help to eliminate
152 // if region and potentially even PHI itself, saving on both divergence
153 // and registers used for the PHI.
154 // Add a small bonus for each of such "if" statements.
155 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
156 if (UP.Threshold < MaxBoost && Br->isConditional()) {
157 BasicBlock *Succ0 = Br->getSuccessor(0);
158 BasicBlock *Succ1 = Br->getSuccessor(1);
159 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
160 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
161 continue;
162 if (dependsOnLocalPhi(L, Br->getCondition())) {
163 UP.Threshold += UnrollThresholdIf;
164 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
165 << " for loop:\n"
166 << *L << " due to " << *Br << '\n');
167 if (UP.Threshold >= MaxBoost)
168 return;
169 }
170 }
171 continue;
172 }
173
174 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
175 if (!GEP)
176 continue;
177
178 unsigned AS = GEP->getAddressSpace();
179 unsigned Threshold = 0;
180 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
181 Threshold = ThresholdPrivate;
182 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
183 Threshold = ThresholdLocal;
184 else
185 continue;
186
187 if (UP.Threshold >= Threshold)
188 continue;
189
190 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
191 const Value *Ptr = GEP->getPointerOperand();
192 const AllocaInst *Alloca =
193 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
194 if (!Alloca || !Alloca->isStaticAlloca())
195 continue;
196 Type *Ty = Alloca->getAllocatedType();
197 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
198 if (AllocaSize > MaxAlloca)
199 continue;
200 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
201 AS == AMDGPUAS::REGION_ADDRESS) {
202 LocalGEPsSeen++;
203 // Inhibit unroll for local memory if we have seen addressing not to
204 // a variable, most likely we will be unable to combine it.
205 // Do not unroll too deep inner loops for local memory to give a chance
206 // to unroll an outer loop for a more important reason.
207 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
208 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
209 !isa<Argument>(GEP->getPointerOperand())))
210 continue;
211 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
212 << *L << " due to LDS use.\n");
213 UP.Runtime = UnrollRuntimeLocal;
214 }
215
216 // Check if GEP depends on a value defined by this loop itself.
217 bool HasLoopDef = false;
218 for (const Value *Op : GEP->operands()) {
219 const Instruction *Inst = dyn_cast<Instruction>(Op);
220 if (!Inst || L->isLoopInvariant(Op))
221 continue;
222
223 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
224 return SubLoop->contains(Inst); }))
225 continue;
226 HasLoopDef = true;
227 break;
228 }
229 if (!HasLoopDef)
230 continue;
231
232 // We want to do whatever we can to limit the number of alloca
233 // instructions that make it through to the code generator. allocas
234 // require us to use indirect addressing, which is slow and prone to
235 // compiler bugs. If this loop does an address calculation on an
236 // alloca ptr, then we want to use a higher than normal loop unroll
237 // threshold. This will give SROA a better chance to eliminate these
238 // allocas.
239 //
240 // We also want to have more unrolling for local memory to let ds
241 // instructions with different offsets combine.
242 //
243 // Don't use the maximum allowed value here as it will make some
244 // programs way too big.
245 UP.Threshold = Threshold;
246 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
247 << " for loop:\n"
248 << *L << " due to " << *GEP << '\n');
249 if (UP.Threshold >= MaxBoost)
250 return;
251 }
252
253 // If we got a GEP in a small BB from inner loop then increase max trip
254 // count to analyze for better estimation cost in unroll
255 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
256 UP.MaxIterationsCountToAnalyze = 32;
257 }
258}
259
260void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
261 TTI::PeelingPreferences &PP) {
262 BaseT::getPeelingPreferences(L, SE, PP);
263}
264
265const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
266 // Codegen control options which don't matter.
267 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
268 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
269 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
270 AMDGPU::FeatureUnalignedAccessMode,
271
272 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
273
274 // Property of the kernel/environment which can't actually differ.
275 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
276 AMDGPU::FeatureTrapHandler,
277
278 // The default assumption needs to be ecc is enabled, but no directly
279 // exposed operations depend on it, so it can be safely inlined.
280 AMDGPU::FeatureSRAMECC,
281
282 // Perf-tuning features
283 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
284
285GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
286 : BaseT(TM, F.getParent()->getDataLayout()),
287 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
288 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
289 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
290 MaxVGPRs(ST->getMaxNumVGPRs(
291 std::max(ST->getWavesPerEU(F).first,
292 ST->getWavesPerEUForWorkGroup(
293 ST->getFlatWorkGroupSizes(F).second)))) {
294 AMDGPU::SIModeRegisterDefaults Mode(F);
295 HasFP32Denormals = Mode.allFP32Denormals();
296 HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
297}
298
299unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
300 // The concept of vector registers doesn't really exist. Some packed vector
301 // operations operate on the normal 32-bit registers.
302 return MaxVGPRs;
303}
304
305unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
306 // This is really the number of registers to fill when vectorizing /
307 // interleaving loops, so we lie to avoid trying to use all registers.
308 return getHardwareNumberOfRegisters(Vec) >> 3;
309}
310
311unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
312 const SIRegisterInfo *TRI = ST->getRegisterInfo();
313 const TargetRegisterClass *RC = TRI->getRegClass(RCID);
314 unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
315 return getHardwareNumberOfRegisters(false) / NumVGPRs;
316}
317
318TypeSize
319GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
320 switch (K) {
321 case TargetTransformInfo::RGK_Scalar:
322 return TypeSize::getFixed(32);
323 case TargetTransformInfo::RGK_FixedWidthVector:
324 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
325 case TargetTransformInfo::RGK_ScalableVector:
326 return TypeSize::getScalable(0);
327 }
328 llvm_unreachable("Unsupported register kind");
329}
330
331unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
332 return 32;
333}
334
335unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
336 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
337 return 32 * 4 / ElemWidth;
338 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
339 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
340 : 1;
341}
342
343unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
344 unsigned ChainSizeInBytes,
345 VectorType *VecTy) const {
346 unsigned VecRegBitWidth = VF * LoadSize;
347 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
348 // TODO: Support element-size less than 32bit?
349 return 128 / LoadSize;
350
351 return VF;
352}
353
354unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
355 unsigned ChainSizeInBytes,
356 VectorType *VecTy) const {
357 unsigned VecRegBitWidth = VF * StoreSize;
358 if (VecRegBitWidth > 128)
359 return 128 / StoreSize;
360
361 return VF;
362}
363
364unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
365 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
366 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
367 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
368 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
369 return 512;
370 }
371
372 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
373 return 8 * ST->getMaxPrivateElementSize();
374
375 // Common to flat, global, local and region. Assume for unknown addrspace.
376 return 128;
377}
378
379bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
380 Align Alignment,
381 unsigned AddrSpace) const {
382 // We allow vectorization of flat stores, even though we may need to decompose
383 // them later if they may access private memory. We don't have enough context
384 // here, and legalization can handle it.
385 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
386 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
387 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
388 }
389 return true;
390}
391
392bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
393 Align Alignment,
394 unsigned AddrSpace) const {
395 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
396}
397
398bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
399 Align Alignment,
400 unsigned AddrSpace) const {
401 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
402}
403
404// FIXME: Really we would like to issue multiple 128-bit loads and stores per
405// iteration. Should we report a larger size and let it legalize?
406//
407// FIXME: Should we use narrower types for local/region, or account for when
408// unaligned access is legal?
409//
410// FIXME: This could use fine tuning and microbenchmarks.
411Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
412 unsigned SrcAddrSpace,
413 unsigned DestAddrSpace,
414 unsigned SrcAlign,
415 unsigned DestAlign) const {
416 unsigned MinAlign = std::min(SrcAlign, DestAlign);
417
418 // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
419 // hardware into byte accesses. If you assume all alignments are equally
420 // probable, it's more efficient on average to use short accesses for this
421 // case.
422 if (MinAlign == 2)
423 return Type::getInt16Ty(Context);
424
425 // Not all subtargets have 128-bit DS instructions, and we currently don't
426 // form them by default.
427 if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
428 SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
429 DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
430 DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
431 return FixedVectorType::get(Type::getInt32Ty(Context), 2);
432 }
433
434 // Global memory works best with 16-byte accesses. Private memory will also
435 // hit this, although they'll be decomposed.
436 return FixedVectorType::get(Type::getInt32Ty(Context), 4);
437}
438
439void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
440 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
441 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
442 unsigned SrcAlign, unsigned DestAlign) const {
443 assert(RemainingBytes < 16);
444
445 unsigned MinAlign = std::min(SrcAlign, DestAlign);
446
447 if (MinAlign != 2) {
448 Type *I64Ty = Type::getInt64Ty(Context);
449 while (RemainingBytes >= 8) {
450 OpsOut.push_back(I64Ty);
451 RemainingBytes -= 8;
452 }
453
454 Type *I32Ty = Type::getInt32Ty(Context);
455 while (RemainingBytes >= 4) {
456 OpsOut.push_back(I32Ty);
457 RemainingBytes -= 4;
458 }
459 }
460
461 Type *I16Ty = Type::getInt16Ty(Context);
462 while (RemainingBytes >= 2) {
463 OpsOut.push_back(I16Ty);
464 RemainingBytes -= 2;
465 }
466
467 Type *I8Ty = Type::getInt8Ty(Context);
468 while (RemainingBytes) {
469 OpsOut.push_back(I8Ty);
470 --RemainingBytes;
471 }
472}
473
474unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
475 // Disable unrolling if the loop is not vectorized.
476 // TODO: Enable this again.
477 if (VF == 1)
478 return 1;
479
480 return 8;
481}
482
483bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
484 MemIntrinsicInfo &Info) const {
485 switch (Inst->getIntrinsicID()) {
486 case Intrinsic::amdgcn_atomic_inc:
487 case Intrinsic::amdgcn_atomic_dec:
488 case Intrinsic::amdgcn_ds_ordered_add:
489 case Intrinsic::amdgcn_ds_ordered_swap:
490 case Intrinsic::amdgcn_ds_fadd:
491 case Intrinsic::amdgcn_ds_fmin:
492 case Intrinsic::amdgcn_ds_fmax: {
493 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
494 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
495 if (!Ordering || !Volatile)
496 return false; // Invalid.
497
498 unsigned OrderingVal = Ordering->getZExtValue();
499 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
500 return false;
501
502 Info.PtrVal = Inst->getArgOperand(0);
503 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
504 Info.ReadMem = true;
505 Info.WriteMem = true;
506 Info.IsVolatile = !Volatile->isNullValue();
507 return true;
508 }
509 default:
510 return false;
511 }
512}
513
514InstructionCost GCNTTIImpl::getArithmeticInstrCost(
515 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
516 TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
517 TTI::OperandValueProperties Opd1PropInfo,
518 TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
519 const Instruction *CxtI) {
520 EVT OrigTy = TLI->getValueType(DL, Ty);
521 if (!OrigTy.isSimple()) {
522 // FIXME: We're having to query the throughput cost so that the basic
523 // implementation tries to generate legalize and scalarization costs. Maybe
524 // we could hoist the scalarization code here?
525 if (CostKind != TTI::TCK_CodeSize)
526 return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
527 Opd1Info, Opd2Info, Opd1PropInfo,
528 Opd2PropInfo, Args, CxtI);
529 // Scalarization
530
531 // Check if any of the operands are vector operands.
532 int ISD = TLI->InstructionOpcodeToISD(Opcode);
533 assert(ISD && "Invalid opcode");
534
535 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
536
537 bool IsFloat = Ty->isFPOrFPVectorTy();
538 // Assume that floating point arithmetic operations cost twice as much as
539 // integer operations.
540 unsigned OpCost = (IsFloat ? 2 : 1);
541
542 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
543 // The operation is legal. Assume it costs 1.
544 // TODO: Once we have extract/insert subvector cost we need to use them.
545 return LT.first * OpCost;
546 }
547
548 if (!TLI->isOperationExpand(ISD, LT.second)) {
549 // If the operation is custom lowered, then assume that the code is twice
550 // as expensive.
551 return LT.first * 2 * OpCost;
552 }
553
554 // Else, assume that we need to scalarize this op.
555 // TODO: If one of the types get legalized by splitting, handle this
556 // similarly to what getCastInstrCost() does.
557 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
558 unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
559 InstructionCost Cost = getArithmeticInstrCost(
560 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
561 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
562 // Return the cost of multiple scalar invocation plus the cost of
563 // inserting and extracting the values.
564 SmallVector<Type *> Tys(Args.size(), Ty);
565 return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
566 }
567
568 // We don't know anything about this scalar instruction.
569 return OpCost;
570 }
571
572 // Legalize the type.
573 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
574 int ISD = TLI->InstructionOpcodeToISD(Opcode);
575
576 // Because we don't have any legal vector operations, but the legal types, we
577 // need to account for split vectors.
578 unsigned NElts = LT.second.isVector() ?
579 LT.second.getVectorNumElements() : 1;
580
581 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
582
583 switch (ISD) {
584 case ISD::SHL:
585 case ISD::SRL:
586 case ISD::SRA:
587 if (SLT == MVT::i64)
588 return get64BitInstrCost(CostKind) * LT.first * NElts;
589
590 if (ST->has16BitInsts() && SLT == MVT::i16)
591 NElts = (NElts + 1) / 2;
592
593 // i32
594 return getFullRateInstrCost() * LT.first * NElts;
595 case ISD::ADD:
596 case ISD::SUB:
597 case ISD::AND:
598 case ISD::OR:
599 case ISD::XOR:
600 if (SLT == MVT::i64) {
601 // and, or and xor are typically split into 2 VALU instructions.
602 return 2 * getFullRateInstrCost() * LT.first * NElts;
603 }
604
605 if (ST->has16BitInsts() && SLT == MVT::i16)
606 NElts = (NElts + 1) / 2;
607
608 return LT.first * NElts * getFullRateInstrCost();
609 case ISD::MUL: {
610 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
611 if (SLT == MVT::i64) {
612 const int FullRateCost = getFullRateInstrCost();
613 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
614 }
615
616 if (ST->has16BitInsts() && SLT == MVT::i16)
617 NElts = (NElts + 1) / 2;
618
619 // i32
620 return QuarterRateCost * NElts * LT.first;
621 }
622 case ISD::FMUL:
623 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
624 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
625 // fused operation.
626 if (CxtI && CxtI->hasOneUse())
627 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
628 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
629 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
630 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
631 return TargetTransformInfo::TCC_Free;
632 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
633 return TargetTransformInfo::TCC_Free;
634
635 // Estimate all types may be fused with contract/unsafe flags
636 const TargetOptions &Options = TLI->getTargetMachine().Options;
637 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
638 Options.UnsafeFPMath ||
639 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
640 return TargetTransformInfo::TCC_Free;
641 }
642 }
643 LLVM_FALLTHROUGH;
644 case ISD::FADD:
645 case ISD::FSUB:
646 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
647 NElts = (NElts + 1) / 2;
648 if (SLT == MVT::f64)
649 return LT.first * NElts * get64BitInstrCost(CostKind);
650
651 if (ST->has16BitInsts() && SLT == MVT::f16)
652 NElts = (NElts + 1) / 2;
653
654 if (SLT == MVT::f32 || SLT == MVT::f16)
655 return LT.first * NElts * getFullRateInstrCost();
656 break;
657 case ISD::FDIV:
658 case ISD::FREM:
659 // FIXME: frem should be handled separately. The fdiv in it is most of it,
660 // but the current lowering is also not entirely correct.
661 if (SLT == MVT::f64) {
662 int Cost = 7 * get64BitInstrCost(CostKind) +
663 getQuarterRateInstrCost(CostKind) +
664 3 * getHalfRateInstrCost(CostKind);
665 // Add cost of workaround.
666 if (!ST->hasUsableDivScaleConditionOutput())
667 Cost += 3 * getFullRateInstrCost();
668
669 return LT.first * Cost * NElts;
670 }
671
672 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
673 // TODO: This is more complicated, unsafe flags etc.
674 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
675 (SLT == MVT::f16 && ST->has16BitInsts())) {
676 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
677 }
678 }
679
680 if (SLT == MVT::f16 && ST->has16BitInsts()) {
681 // 2 x v_cvt_f32_f16
682 // f32 rcp
683 // f32 fmul
684 // v_cvt_f16_f32
685 // f16 div_fixup
686 int Cost =
687 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
688 return LT.first * Cost * NElts;
689 }
690
691 if (SLT == MVT::f32 || SLT == MVT::f16) {
692 // 4 more v_cvt_* insts without f16 insts support
693 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
694 1 * getQuarterRateInstrCost(CostKind);
695
696 if (!HasFP32Denormals) {
697 // FP mode switches.
698 Cost += 2 * getFullRateInstrCost();
699 }
700
701 return LT.first * NElts * Cost;
702 }
703 break;
704 case ISD::FNEG:
705 // Use the backend's estimation. If fneg is not free, each element will cost
706 // one additional instruction.
707 return TLI->isFNegFree(SLT) ? 0 : NElts;
708 default:
709 break;
710 }
711
712 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
713 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
714}
715
716// Return true if there's a potential benefit from using v2f16/v2i16
717// instructions for an intrinsic, even if it requires nontrivial legalization.
718static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
719 switch (ID) {
720 case Intrinsic::fma: // TODO: fmuladd
721 // There's a small benefit to using vector ops in the legalized code.
722 case Intrinsic::round:
723 case Intrinsic::uadd_sat:
724 case Intrinsic::usub_sat:
725 case Intrinsic::sadd_sat:
726 case Intrinsic::ssub_sat:
727 return true;
728 default:
729 return false;
730 }
731}
732
733InstructionCost
734GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
735 TTI::TargetCostKind CostKind) {
736 if (ICA.getID() == Intrinsic::fabs)
737 return 0;
738
739 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
740 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
741
742 Type *RetTy = ICA.getReturnType();
743 EVT OrigTy = TLI->getValueType(DL, RetTy);
744 if (!OrigTy.isSimple()) {
745 if (CostKind != TTI::TCK_CodeSize)
746 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
747
748 // TODO: Combine these two logic paths.
749 if (ICA.isTypeBasedOnly())
750 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
751
752 unsigned RetVF =
753 (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
754 : 1);
755 const IntrinsicInst *I = ICA.getInst();
756 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
757 FastMathFlags FMF = ICA.getFlags();
758 // Assume that we need to scalarize this intrinsic.
759
760 // Compute the scalarization overhead based on Args for a vector
761 // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
762 // CostModel will pass a vector RetTy and VF is 1.
763 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
764 if (RetVF > 1) {
765 ScalarizationCost = 0;
766 if (!RetTy->isVoidTy())
767 ScalarizationCost +=
768 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
769 ScalarizationCost +=
770 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
771 }
772
773 IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
774 ScalarizationCost);
775 return getIntrinsicInstrCost(Attrs, CostKind);
776 }
777
778 // Legalize the type.
779 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
780
781 unsigned NElts = LT.second.isVector() ?
782 LT.second.getVectorNumElements() : 1;
783
784 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
785
786 if (SLT == MVT::f64)
787 return LT.first * NElts * get64BitInstrCost(CostKind);
788
789 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
790 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
791 NElts = (NElts + 1) / 2;
792
793 // TODO: Get more refined intrinsic costs?
794 unsigned InstRate = getQuarterRateInstrCost(CostKind);
795
796 switch (ICA.getID()) {
797 case Intrinsic::fma:
798 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
799 : getQuarterRateInstrCost(CostKind);
800 break;
801 case Intrinsic::uadd_sat:
802 case Intrinsic::usub_sat:
803 case Intrinsic::sadd_sat:
804 case Intrinsic::ssub_sat:
805 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
806 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
807 NElts = 1;
808 break;
809 }
810
811 return LT.first * NElts * InstRate;
812}
813
814InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
815 TTI::TargetCostKind CostKind,
816 const Instruction *I) {
817 assert((I == nullptr || I->getOpcode() == Opcode) &&
818 "Opcode should reflect passed instruction.");
819 const bool SCost =
820 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
821 const int CBrCost = SCost ? 5 : 7;
822 switch (Opcode) {
823 case Instruction::Br: {
824 // Branch instruction takes about 4 slots on gfx900.
825 auto BI = dyn_cast_or_null<BranchInst>(I);
826 if (BI && BI->isUnconditional())
827 return SCost ? 1 : 4;
828 // Suppose conditional branch takes additional 3 exec manipulations
829 // instructions in average.
830 return CBrCost;
831 }
832 case Instruction::Switch: {
833 auto SI = dyn_cast_or_null<SwitchInst>(I);
834 // Each case (including default) takes 1 cmp + 1 cbr instructions in
835 // average.
836 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
837 }
838 case Instruction::Ret:
839 return SCost ? 1 : 10;
840 }
841 return BaseT::getCFInstrCost(Opcode, CostKind, I);
842}
843
844InstructionCost
845GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
846 Optional<FastMathFlags> FMF,
847 TTI::TargetCostKind CostKind) {
848 if (TTI::requiresOrderedReduction(FMF))
849 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
850
851 EVT OrigTy = TLI->getValueType(DL, Ty);
852
853 // Computes cost on targets that have packed math instructions(which support
854 // 16-bit types only).
855 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
856 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
857
858 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
859 return LT.first * getFullRateInstrCost();
860}
861
862InstructionCost
863GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
864 bool IsUnsigned,
865 TTI::TargetCostKind CostKind) {
866 EVT OrigTy = TLI->getValueType(DL, Ty);
867
868 // Computes cost on targets that have packed math instructions(which support
869 // 16-bit types only).
870 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1. Assuming the condition is false
2. Assuming the condition is true
3. Taking true branch
871 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
4. Calling 'BasicTTIImplBase::getMinMaxReductionCost'
872
873 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
874 return LT.first * getHalfRateInstrCost(CostKind);
875}
876
877InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
878 unsigned Index) {
879 switch (Opcode) {
880 case Instruction::ExtractElement:
881 case Instruction::InsertElement: {
882 unsigned EltSize
883 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
884 if (EltSize < 32) {
885 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
886 return 0;
887 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
888 }
889
890 // Extracts are just reads of a subregister, so are free. Inserts are
891 // considered free because we don't want to have any cost for scalarizing
892 // operations, and we don't have to copy into a different register class.
893
894 // Dynamic indexing isn't free and is best avoided.
895 return Index == ~0u ? 2 : 0;
896 }
897 default:
898 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
899 }
900}
901
902/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
903/// this is analyzing the collective result of all output registers. Otherwise,
904/// this is only querying a specific result index if this returns multiple
905/// registers in a struct.
906bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
907 const CallInst *CI, ArrayRef<unsigned> Indices) const {
908 // TODO: Handle complex extract indices
909 if (Indices.size() > 1)
910 return true;
911
912 const DataLayout &DL = CI->getModule()->getDataLayout();
913 const SIRegisterInfo *TRI = ST->getRegisterInfo();
914 TargetLowering::AsmOperandInfoVector TargetConstraints =
915 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
916
917 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
918
919 int OutputIdx = 0;
920 for (auto &TC : TargetConstraints) {
921 if (TC.Type != InlineAsm::isOutput)
922 continue;
923
924 // Skip outputs we don't care about.
925 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
926 continue;
927
928 TLI->ComputeConstraintToUse(TC, SDValue());
929
930 Register AssignedReg;
931 const TargetRegisterClass *RC;
932 std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
933 TRI, TC.ConstraintCode, TC.ConstraintVT);
934 if (AssignedReg) {
935 // FIXME: This is a workaround for getRegForInlineAsmConstraint
936 // returning VS_32
937 RC = TRI->getPhysRegClass(AssignedReg);
938 }
939
940 // For AGPR constraints null is returned on subtargets without AGPRs, so
941 // assume divergent for null.
942 if (!RC || !TRI->isSGPRClass(RC))
943 return true;
944 }
945
946 return false;
947}
948
949/// \returns true if the new GPU divergence analysis is enabled.
950bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
951 return !UseLegacyDA;
952}
953
954/// \returns true if the result of the value could potentially be
955/// different across workitems in a wavefront.
956bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
957 if (const Argument *A = dyn_cast<Argument>(V))
958 return !AMDGPU::isArgPassedInSGPR(A);
959
960 // Loads from the private and flat address spaces are divergent, because
961 // threads can execute the load instruction with the same inputs and get
962 // different results.
963 //
964 // All other loads are not divergent, because if threads issue loads with the
965 // same arguments, they will always get the same result.
966 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
967 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
968 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
969
970 // Atomics are divergent because they are executed sequentially: when an
971 // atomic operation refers to the same address in each thread, then each
972 // thread after the first sees the value written by the previous thread as
973 // original value.
974 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
975 return true;
976
977 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
978 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
979
980 // Assume all function calls are a source of divergence.
981 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
982 if (CI->isInlineAsm())
983 return isInlineAsmSourceOfDivergence(CI);
984 return true;
985 }
986
987 // Assume all function calls are a source of divergence.
988 if (isa<InvokeInst>(V))
989 return true;
990
991 return false;
992}
993
994bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
995 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
996 switch (Intrinsic->getIntrinsicID()) {
997 default:
998 return false;
999 case Intrinsic::amdgcn_readfirstlane:
1000 case Intrinsic::amdgcn_readlane:
1001 case Intrinsic::amdgcn_icmp:
1002 case Intrinsic::amdgcn_fcmp:
1003 case Intrinsic::amdgcn_ballot:
1004 case Intrinsic::amdgcn_if_break:
1005 return true;
1006 }
1007 }
1008
1009 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1010 if (CI->isInlineAsm())
1011 return !isInlineAsmSourceOfDivergence(CI);
1012 return false;
1013 }
1014
1015 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1016 if (!ExtValue)
1017 return false;
1018
1019 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1020 if (!CI)
1021 return false;
1022
1023 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1024 switch (Intrinsic->getIntrinsicID()) {
1025 default:
1026 return false;
1027 case Intrinsic::amdgcn_if:
1028 case Intrinsic::amdgcn_else: {
1029 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1030 return Indices.size() == 1 && Indices[0] == 1;
1031 }
1032 }
1033 }
1034
1035 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1036 // divergent for the overall struct return. We need to override it in the
1037 // case we're extracting an SGPR component here.
1038 if (CI->isInlineAsm())
1039 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1040
1041 return false;
1042}
1043
1044bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1045 Intrinsic::ID IID) const {
1046 switch (IID) {
1047 case Intrinsic::amdgcn_atomic_inc:
1048 case Intrinsic::amdgcn_atomic_dec:
1049 case Intrinsic::amdgcn_ds_fadd:
1050 case Intrinsic::amdgcn_ds_fmin:
1051 case Intrinsic::amdgcn_ds_fmax:
1052 case Intrinsic::amdgcn_is_shared:
1053 case Intrinsic::amdgcn_is_private:
1054 OpIndexes.push_back(0);
1055 return true;
1056 default:
1057 return false;
1058 }
1059}
1060
1061Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1062 Value *OldV,
1063 Value *NewV) const {
1064 auto IntrID = II->getIntrinsicID();
1065 switch (IntrID) {
1066 case Intrinsic::amdgcn_atomic_inc:
1067 case Intrinsic::amdgcn_atomic_dec:
1068 case Intrinsic::amdgcn_ds_fadd:
1069 case Intrinsic::amdgcn_ds_fmin:
1070 case Intrinsic::amdgcn_ds_fmax: {
1071 const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1072 if (!IsVolatile->isZero())
1073 return nullptr;
1074 Module *M = II->getParent()->getParent()->getParent();
1075 Type *DestTy = II->getType();
1076 Type *SrcTy = NewV->getType();
1077 Function *NewDecl =
1078 Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1079 II->setArgOperand(0, NewV);
1080 II->setCalledFunction(NewDecl);
1081 return II;
1082 }
1083 case Intrinsic::amdgcn_is_shared:
1084 case Intrinsic::amdgcn_is_private: {
1085 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1086 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1087 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1088 LLVMContext &Ctx = NewV->getType()->getContext();
1089 ConstantInt *NewVal = (TrueAS == NewAS) ?
1090 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1091 return NewVal;
1092 }
1093 case Intrinsic::ptrmask: {
1094 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1095 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1096 Value *MaskOp = II->getArgOperand(1);
1097 Type *MaskTy = MaskOp->getType();
1098
1099 bool DoTruncate = false;
1100
1101 const GCNTargetMachine &TM =
1102 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1103 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1104 // All valid 64-bit to 32-bit casts work by chopping off the high
1105 // bits. Any masking only clearing the low bits will also apply in the new
1106 // address space.
1107 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1108 DL.getPointerSizeInBits(NewAS) != 32)
1109 return nullptr;
1110
1111 // TODO: Do we need to thread more context in here?
1112 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1113 if (Known.countMinLeadingOnes() < 32)
1114 return nullptr;
1115
1116 DoTruncate = true;
1117 }
1118
1119 IRBuilder<> B(II);
1120 if (DoTruncate) {
1121 MaskTy = B.getInt32Ty();
1122 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1123 }
1124
1125 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1126 {NewV, MaskOp});
1127 }
1128 default:
1129 return nullptr;
1130 }
1131}
1132
1133InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1134 VectorType *VT, ArrayRef<int> Mask,
1135 int Index, VectorType *SubTp) {
1136 Kind = improveShuffleKindFromMask(Kind, Mask);
1137 if (ST->hasVOP3PInsts()) {
1138 if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1139 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1140 // With op_sel VOP3P instructions freely can access the low half or high
1141 // half of a register, so any swizzle is free.
1142
1143 switch (Kind) {
1144 case TTI::SK_Broadcast:
1145 case TTI::SK_Reverse:
1146 case TTI::SK_PermuteSingleSrc:
1147 return 0;
1148 default:
1149 break;
1150 }
1151 }
1152 }
1153
1154 return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
1155}
1156
1157bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1158 const Function *Callee) const {
1159 const TargetMachine &TM = getTLI()->getTargetMachine();
1160 const GCNSubtarget *CallerST
1161 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1162 const GCNSubtarget *CalleeST
1163 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1164
1165 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1166 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1167
1168 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1169 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1170 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1171 return false;
1172
1173 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1174 // no way to support merge for backend defined attributes.
1175 AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1176 AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1177 if (!CallerMode.isInlineCompatible(CalleeMode))
1178 return false;
1179
1180 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1181 Callee->hasFnAttribute(Attribute::InlineHint))
1182 return true;
1183
1184 // Hack to make compile times reasonable.
1185 if (InlineMaxBB) {
1186 // Single BB does not increase total BB amount.
1187 if (Callee->size() == 1)
1188 return true;
1189 size_t BBSize = Caller->size() + Callee->size() - 1;
1190 return BBSize <= InlineMaxBB;
1191 }
1192
1193 return true;
1194}
1195
1196unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1197 // If we have a pointer to private array passed into a function
1198 // it will not be optimized out, leaving scratch usage.
1199 // Increase the inline threshold to allow inlining in this case.
1200 uint64_t AllocaSize = 0;
1201 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1202 for (Value *PtrArg : CB->args()) {
1203 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1204 if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1205 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1206 continue;
1207
1208 PtrArg = getUnderlyingObject(PtrArg);
1209 if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1210 if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1211 continue;
1212 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1213 // If the amount of stack memory is excessive we will not be able
1214 // to get rid of the scratch anyway, bail out.
1215 if (AllocaSize > ArgAllocaCutoff) {
1216 AllocaSize = 0;
1217 break;
1218 }
1219 }
1220 }
1221 if (AllocaSize)
1222 return ArgAllocaCost;
1223 return 0;
1224}
1225
1226void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1227 TTI::UnrollingPreferences &UP) {
1228 CommonTTI.getUnrollingPreferences(L, SE, UP);
1229}
1230
1231void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1232 TTI::PeelingPreferences &PP) {
1233 CommonTTI.getPeelingPreferences(L, SE, PP);
1234}
1235
1236int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1237 return ST->hasFullRate64Ops()
1238 ? getFullRateInstrCost()
1239 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1240 : getQuarterRateInstrCost(CostKind);
1241}
1242
1243R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
1244 : BaseT(TM, F.getParent()->getDataLayout()),
1245 ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
1246 TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
1247
1248unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1249 return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1250}
1251
1252unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1253 return getHardwareNumberOfRegisters(Vec);
1254}
1255
1256TypeSize
1257R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1258 return TypeSize::getFixed(32);
1259}
1260
1261unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1262 return 32;
1263}
1264
1265unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1266 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1267 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1268 return 128;
1269 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1270 AddrSpace == AMDGPUAS::REGION_ADDRESS)
1271 return 64;
1272 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1273 return 32;
1274
1275 if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1276 AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1277 (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1278 AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1279 return 128;
1280 llvm_unreachable("unhandled address space");
1281}
1282
1283bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1284 Align Alignment,
1285 unsigned AddrSpace) const {
1286 // We allow vectorization of flat stores, even though we may need to decompose
1287 // them later if they may access private memory. We don't have enough context
1288 // here, and legalization can handle it.
1289 return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1290}
1291
1292bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1293 Align Alignment,
1294 unsigned AddrSpace) const {
1295 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1296}
1297
1298bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1299 Align Alignment,
1300 unsigned AddrSpace) const {
1301 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1302}
1303
1304unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1305 // Disable unrolling if the loop is not vectorized.
1306 // TODO: Enable this again.
1307 if (VF == 1)
1308 return 1;
1309
1310 return 8;
1311}
1312
1313InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
1314 TTI::TargetCostKind CostKind,
1315 const Instruction *I) {
1316 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1317 return Opcode == Instruction::PHI ? 0 : 1;
1318
1319 // XXX - For some reason this isn't called for switch.
1320 switch (Opcode) {
1321 case Instruction::Br:
1322 case Instruction::Ret:
1323 return 10;
1324 default:
1325 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1326 }
1327}
1328
1329InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1330 unsigned Index) {
1331 switch (Opcode) {
1332 case Instruction::ExtractElement:
1333 case Instruction::InsertElement: {
1334 unsigned EltSize
1335 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1336 if (EltSize < 32) {
1337 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1338 }
1339
1340 // Extracts are just reads of a subregister, so are free. Inserts are
1341 // considered free because we don't want to have any cost for scalarizing
1342 // operations, and we don't have to copy into a different register class.
1343
1344 // Dynamic indexing isn't free and is best avoided.
1345 return Index == ~0u ? 2 : 0;
1346 }
1347 default:
1348 return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1349 }
1350}
1351
1352void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1353 TTI::UnrollingPreferences &UP) {
1354 CommonTTI.getUnrollingPreferences(L, SE, UP);
1355}
1356
1357void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1358 TTI::PeelingPreferences &PP) {
1359 CommonTTI.getPeelingPreferences(L, SE, PP);
1360}

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/BasicTTIImpl.h

1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/TargetTransformInfoImpl.h"
27#include "llvm/CodeGen/ISDOpcodes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetSubtargetInfo.h"
30#include "llvm/CodeGen/ValueTypes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/DerivedTypes.h"
36#include "llvm/IR/InstrTypes.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Intrinsics.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/ErrorHandling.h"
46#include "llvm/Support/MachineValueType.h"
47#include "llvm/Support/MathExtras.h"
48#include "llvm/Target/TargetMachine.h"
49#include <algorithm>
50#include <cassert>
51#include <cstdint>
52#include <limits>
53#include <utility>
54
55namespace llvm {
56
57class Function;
58class GlobalValue;
59class LLVMContext;
60class ScalarEvolution;
61class SCEV;
62class TargetMachine;
63
64extern cl::opt<unsigned> PartialUnrollingThreshold;
65
66/// Base class which can be used to help build a TTI implementation.
67///
68/// This class provides as much implementation of the TTI interface as is
69/// possible using the target independent parts of the code generator.
70///
71/// In order to subclass it, your class must implement a getST() method to
72/// return the subtarget, and a getTLI() method to return the target lowering.
73/// We need these methods implemented in the derived class so that this class
74/// doesn't have to duplicate storage for them.
75template <typename T>
76class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
77private:
78 using BaseT = TargetTransformInfoImplCRTPBase<T>;
79 using TTI = TargetTransformInfo;
80
81 /// Helper function to access this as a T.
82 T *thisT() { return static_cast<T *>(this); }
83
84 /// Estimate a cost of Broadcast as an extract and sequence of insert
85 /// operations.
86 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
87 InstructionCost Cost = 0;
88 // Broadcast cost is equal to the cost of extracting the zero'th element
89 // plus the cost of inserting it into every element of the result vector.
90 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
91
92 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
93 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
94 }
95 return Cost;
96 }
97
98 /// Estimate a cost of shuffle as a sequence of extract and insert
99 /// operations.
100 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
101 InstructionCost Cost = 0;
102 // Shuffle cost is equal to the cost of extracting element from its argument
103 // plus the cost of inserting them onto the result vector.
104
105 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
106 // index 0 of first vector, index 1 of second vector,index 2 of first
107 // vector and finally index 3 of second vector and insert them at index
108 // <0,1,2,3> of result vector.
109 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
110 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
111 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
112 }
113 return Cost;
114 }
115
116 /// Estimate a cost of subvector extraction as a sequence of extract and
117 /// insert operations.
118 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
119 FixedVectorType *SubVTy) {
120 assert(VTy && SubVTy &&((void)0)
121 "Can only extract subvectors from vectors")((void)0);
122 int NumSubElts = SubVTy->getNumElements();
123 assert((!isa<FixedVectorType>(VTy) ||((void)0)
124 (Index + NumSubElts) <=((void)0)
125 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&((void)0)
126 "SK_ExtractSubvector index out of range")((void)0);
127
128 InstructionCost Cost = 0;
129 // Subvector extraction cost is equal to the cost of extracting element from
130 // the source type plus the cost of inserting them into the result vector
131 // type.
132 for (int i = 0; i != NumSubElts; ++i) {
133 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
134 i + Index);
135 Cost +=
136 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
137 }
138 return Cost;
139 }
140
141 /// Estimate a cost of subvector insertion as a sequence of extract and
142 /// insert operations.
143 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
144 FixedVectorType *SubVTy) {
145 assert(VTy && SubVTy &&((void)0)
146 "Can only insert subvectors into vectors")((void)0);
147 int NumSubElts = SubVTy->getNumElements();
148 assert((!isa<FixedVectorType>(VTy) ||((void)0)
149 (Index + NumSubElts) <=((void)0)
150 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&((void)0)
151 "SK_InsertSubvector index out of range")((void)0);
152
153 InstructionCost Cost = 0;
154 // Subvector insertion cost is equal to the cost of extracting element from
155 // the source type plus the cost of inserting them into the result vector
156 // type.
157 for (int i = 0; i != NumSubElts; ++i) {
158 Cost +=
159 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
160 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
161 i + Index);
162 }
163 return Cost;
164 }
165
166 /// Local query method delegates up to T which *must* implement this!
167 const TargetSubtargetInfo *getST() const {
168 return static_cast<const T *>(this)->getST();
169 }
170
171 /// Local query method delegates up to T which *must* implement this!
172 const TargetLoweringBase *getTLI() const {
173 return static_cast<const T *>(this)->getTLI();
174 }
175
176 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
177 switch (M) {
178 case TTI::MIM_Unindexed:
179 return ISD::UNINDEXED;
180 case TTI::MIM_PreInc:
181 return ISD::PRE_INC;
182 case TTI::MIM_PreDec:
183 return ISD::PRE_DEC;
184 case TTI::MIM_PostInc:
185 return ISD::POST_INC;
186 case TTI::MIM_PostDec:
187 return ISD::POST_DEC;
188 }
189 llvm_unreachable("Unexpected MemIndexedMode")__builtin_unreachable();
190 }
191
192 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
193 Align Alignment,
194 bool VariableMask,
195 bool IsGatherScatter,
196 TTI::TargetCostKind CostKind) {
197 auto *VT = cast<FixedVectorType>(DataTy);
198 // Assume the target does not have support for gather/scatter operations
199 // and provide a rough estimate.
200 //
201 // First, compute the cost of the individual memory operations.
202 InstructionCost AddrExtractCost =
203 IsGatherScatter
204 ? getVectorInstrCost(Instruction::ExtractElement,
205 FixedVectorType::get(
206 PointerType::get(VT->getElementType(), 0),
207 VT->getNumElements()),
208 -1)
209 : 0;
210 InstructionCost LoadCost =
211 VT->getNumElements() *
212 (AddrExtractCost +
213 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
214
215 // Next, compute the cost of packing the result in a vector.
216 InstructionCost PackingCost = getScalarizationOverhead(
217 VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
218
219 InstructionCost ConditionalCost = 0;
220 if (VariableMask) {
221 // Compute the cost of conditionally executing the memory operations with
222 // variable masks. This includes extracting the individual conditions, a
223 // branches and PHIs to combine the results.
224 // NOTE: Estimating the cost of conditionally executing the memory
225 // operations accurately is quite difficult and the current solution
226 // provides a very rough estimate only.
227 ConditionalCost =
228 VT->getNumElements() *
229 (getVectorInstrCost(
230 Instruction::ExtractElement,
231 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
232 VT->getNumElements()),
233 -1) +
234 getCFInstrCost(Instruction::Br, CostKind) +
235 getCFInstrCost(Instruction::PHI, CostKind));
236 }
237
238 return LoadCost + PackingCost + ConditionalCost;
239 }
240
241protected:
242 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
243 : BaseT(DL) {}
244 virtual ~BasicTTIImplBase() = default;
245
246 using TargetTransformInfoImplBase::DL;
247
248public:
249 /// \name Scalar TTI Implementations
250 /// @{
251 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
252 unsigned AddressSpace, Align Alignment,
253 bool *Fast) const {
254 EVT E = EVT::getIntegerVT(Context, BitWidth);
255 return getTLI()->allowsMisalignedMemoryAccesses(
256 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
257 }
258
259 bool hasBranchDivergence() { return false; }
260
261 bool useGPUDivergenceAnalysis() { return false; }
262
263 bool isSourceOfDivergence(const Value *V) { return false; }
264
265 bool isAlwaysUniform(const Value *V) { return false; }
266
267 unsigned getFlatAddressSpace() {
268 // Return an invalid address space.
269 return -1;
270 }
271
272 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
273 Intrinsic::ID IID) const {
274 return false;
275 }
276
277 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
278 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
279 }
280
281 unsigned getAssumedAddrSpace(const Value *V) const {
282 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
283 }
284
285 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
286 Value *NewV) const {
287 return nullptr;
288 }
289
290 bool isLegalAddImmediate(int64_t imm) {
291 return getTLI()->isLegalAddImmediate(imm);
292 }
293
294 bool isLegalICmpImmediate(int64_t imm) {
295 return getTLI()->isLegalICmpImmediate(imm);
296 }
297
298 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
299 bool HasBaseReg, int64_t Scale,
300 unsigned AddrSpace, Instruction *I = nullptr) {
301 TargetLoweringBase::AddrMode AM;
302 AM.BaseGV = BaseGV;
303 AM.BaseOffs = BaseOffset;
304 AM.HasBaseReg = HasBaseReg;
305 AM.Scale = Scale;
306 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
307 }
308
309 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
310 const DataLayout &DL) const {
311 EVT VT = getTLI()->getValueType(DL, Ty);
312 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
313 }
314
315 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
316 const DataLayout &DL) const {
317 EVT VT = getTLI()->getValueType(DL, Ty);
318 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
319 }
320
321 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
322 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
323 }
324
325 bool isNumRegsMajorCostOfLSR() {
326 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
327 }
328
329 bool isProfitableLSRChainElement(Instruction *I) {
330 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
331 }
332
333 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
334 int64_t BaseOffset, bool HasBaseReg,
335 int64_t Scale, unsigned AddrSpace) {
336 TargetLoweringBase::AddrMode AM;
337 AM.BaseGV = BaseGV;
338 AM.BaseOffs = BaseOffset;
339 AM.HasBaseReg = HasBaseReg;
340 AM.Scale = Scale;
341 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
342 }
343
344 bool isTruncateFree(Type *Ty1, Type *Ty2) {
345 return getTLI()->isTruncateFree(Ty1, Ty2);
346 }
347
348 bool isProfitableToHoist(Instruction *I) {
349 return getTLI()->isProfitableToHoist(I);
350 }
351
352 bool useAA() const { return getST()->useAA(); }
353
354 bool isTypeLegal(Type *Ty) {
355 EVT VT = getTLI()->getValueType(DL, Ty);
356 return getTLI()->isTypeLegal(VT);
357 }
358
359 InstructionCost getRegUsageForType(Type *Ty) {
360 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
361 assert(Val >= 0 && "Negative cost!")((void)0);
362 return Val;
363 }
364
365 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
366 ArrayRef<const Value *> Operands) {
367 return BaseT::getGEPCost(PointeeType, Ptr, Operands);
368 }
369
370 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
371 unsigned &JumpTableSize,
372 ProfileSummaryInfo *PSI,
373 BlockFrequencyInfo *BFI) {
374 /// Try to find the estimated number of clusters. Note that the number of
375 /// clusters identified in this function could be different from the actual
376 /// numbers found in lowering. This function ignore switches that are
377 /// lowered with a mix of jump table / bit test / BTree. This function was
378 /// initially intended to be used when estimating the cost of switch in
379 /// inline cost heuristic, but it's a generic cost model to be used in other
380 /// places (e.g., in loop unrolling).
381 unsigned N = SI.getNumCases();
382 const TargetLoweringBase *TLI = getTLI();
383 const DataLayout &DL = this->getDataLayout();
384
385 JumpTableSize = 0;
386 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
387
388 // Early exit if both a jump table and bit test are not allowed.
389 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
390 return N;
391
392 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
393 APInt MinCaseVal = MaxCaseVal;
394 for (auto CI : SI.cases()) {
395 const APInt &CaseVal = CI.getCaseValue()->getValue();
396 if (CaseVal.sgt(MaxCaseVal))
397 MaxCaseVal = CaseVal;
398 if (CaseVal.slt(MinCaseVal))
399 MinCaseVal = CaseVal;
400 }
401
402 // Check if suitable for a bit test
403 if (N <= DL.getIndexSizeInBits(0u)) {
404 SmallPtrSet<const BasicBlock *, 4> Dests;
405 for (auto I : SI.cases())
406 Dests.insert(I.getCaseSuccessor());
407
408 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
409 DL))
410 return 1;
411 }
412
413 // Check if suitable for a jump table.
414 if (IsJTAllowed) {
415 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
416 return N;
417 uint64_t Range =
418 (MaxCaseVal - MinCaseVal)
419 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
420 // Check whether a range of clusters is dense enough for a jump table
421 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
422 JumpTableSize = Range;
423 return 1;
424 }
425 }
426 return N;
427 }
428
429 bool shouldBuildLookupTables() {
430 const TargetLoweringBase *TLI = getTLI();
431 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
432 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
433 }
434
435 bool shouldBuildRelLookupTables() const {
436 const TargetMachine &TM = getTLI()->getTargetMachine();
437 // If non-PIC mode, do not generate a relative lookup table.
438 if (!TM.isPositionIndependent())
439 return false;
440
441 /// Relative lookup table entries consist of 32-bit offsets.
442 /// Do not generate relative lookup tables for large code models
443 /// in 64-bit achitectures where 32-bit offsets might not be enough.
444 if (TM.getCodeModel() == CodeModel::Medium ||
445 TM.getCodeModel() == CodeModel::Large)
446 return false;
447
448 Triple TargetTriple = TM.getTargetTriple();
449 if (!TargetTriple.isArch64Bit())
450 return false;
451
452 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
453 // there.
454 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
455 return false;
456
457 return true;
458 }
459
460 bool haveFastSqrt(Type *Ty) {
461 const TargetLoweringBase *TLI = getTLI();
462 EVT VT = TLI->getValueType(DL, Ty);
463 return TLI->isTypeLegal(VT) &&
464 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
465 }
466
467 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
468 return true;
469 }
470
471 InstructionCost getFPOpCost(Type *Ty) {
472 // Check whether FADD is available, as a proxy for floating-point in
473 // general.
474 const TargetLoweringBase *TLI = getTLI();
475 EVT VT = TLI->getValueType(DL, Ty);
476 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
477 return TargetTransformInfo::TCC_Basic;
478 return TargetTransformInfo::TCC_Expensive;
479 }
480
481 unsigned getInliningThresholdMultiplier() { return 1; }
482 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
483
484 int getInlinerVectorBonusPercent() { return 150; }
485
486 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
487 TTI::UnrollingPreferences &UP) {
488 // This unrolling functionality is target independent, but to provide some
489 // motivation for its intended use, for x86:
490
491 // According to the Intel 64 and IA-32 Architectures Optimization Reference
492 // Manual, Intel Core models and later have a loop stream detector (and
493 // associated uop queue) that can benefit from partial unrolling.
494 // The relevant requirements are:
495 // - The loop must have no more than 4 (8 for Nehalem and later) branches
496 // taken, and none of them may be calls.
497 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
498
499 // According to the Software Optimization Guide for AMD Family 15h
500 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
501 // and loop buffer which can benefit from partial unrolling.
502 // The relevant requirements are:
503 // - The loop must have fewer than 16 branches
504 // - The loop must have less than 40 uops in all executed loop branches
505
506 // The number of taken branches in a loop is hard to estimate here, and
507 // benchmarking has revealed that it is better not to be conservative when
508 // estimating the branch count. As a result, we'll ignore the branch limits
509 // until someone finds a case where it matters in practice.
510
511 unsigned MaxOps;
512 const TargetSubtargetInfo *ST = getST();
513 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
514 MaxOps = PartialUnrollingThreshold;
515 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
516 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
517 else
518 return;
519
520 // Scan the loop: don't unroll loops with calls.
521 for (BasicBlock *BB : L->blocks()) {
522 for (Instruction &I : *BB) {
523 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
524 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
525 if (!thisT()->isLoweredToCall(F))
526 continue;
527 }
528
529 return;
530 }
531 }
532 }
533
534 // Enable runtime and partial unrolling up to the specified size.
535 // Enable using trip count upper bound to unroll loops.
536 UP.Partial = UP.Runtime = UP.UpperBound = true;
537 UP.PartialThreshold = MaxOps;
538
539 // Avoid unrolling when optimizing for size.
540 UP.OptSizeThreshold = 0;
541 UP.PartialOptSizeThreshold = 0;
542
543 // Set number of instructions optimized when "back edge"
544 // becomes "fall through" to default value of 2.
545 UP.BEInsns = 2;
546 }
547
548 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
549 TTI::PeelingPreferences &PP) {
550 PP.PeelCount = 0;
551 PP.AllowPeeling = true;
552 PP.AllowLoopNestsPeeling = false;
553 PP.PeelProfiledIterations = true;
554 }
555
556 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
557 AssumptionCache &AC,
558 TargetLibraryInfo *LibInfo,
559 HardwareLoopInfo &HWLoopInfo) {
560 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
561 }
562
563 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
564 AssumptionCache &AC, TargetLibraryInfo *TLI,
565 DominatorTree *DT,
566 const LoopAccessInfo *LAI) {
567 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
568 }
569
570 bool emitGetActiveLaneMask() {
571 return BaseT::emitGetActiveLaneMask();
572 }
573
574 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
575 IntrinsicInst &II) {
576 return BaseT::instCombineIntrinsic(IC, II);
577 }
578
579 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
580 IntrinsicInst &II,
581 APInt DemandedMask,
582 KnownBits &Known,
583 bool &KnownBitsComputed) {
584 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
585 KnownBitsComputed);
586 }
587
588 Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
589 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
590 APInt &UndefElts2, APInt &UndefElts3,
591 std::function<void(Instruction *, unsigned, APInt, APInt &)>
592 SimplifyAndSetOp) {
593 return BaseT::simplifyDemandedVectorEltsIntrinsic(
594 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
595 SimplifyAndSetOp);
596 }
597
598 InstructionCost getInstructionLatency(const Instruction *I) {
599 if (isa<LoadInst>(I))
600 return getST()->getSchedModel().DefaultLoadLatency;
601
602 return BaseT::getInstructionLatency(I);
603 }
604
605 virtual Optional<unsigned>
606 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
607 return Optional<unsigned>(
608 getST()->getCacheSize(static_cast<unsigned>(Level)));
609 }
610
611 virtual Optional<unsigned>
612 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
613 Optional<unsigned> TargetResult =
614 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
615
616 if (TargetResult)
617 return TargetResult;
618
619 return BaseT::getCacheAssociativity(Level);
620 }
621
622 virtual unsigned getCacheLineSize() const {
623 return getST()->getCacheLineSize();
624 }
625
626 virtual unsigned getPrefetchDistance() const {
627 return getST()->getPrefetchDistance();
628 }
629
630 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
631 unsigned NumStridedMemAccesses,
632 unsigned NumPrefetches,
633 bool HasCall) const {
634 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
635 NumPrefetches, HasCall);
636 }
637
638 virtual unsigned getMaxPrefetchIterationsAhead() const {
639 return getST()->getMaxPrefetchIterationsAhead();
640 }
641
642 virtual bool enableWritePrefetching() const {
643 return getST()->enableWritePrefetching();
644 }
645
646 /// @}
647
648 /// \name Vector TTI Implementations
649 /// @{
650
651 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
652 return TypeSize::getFixed(32);
653 }
654
655 Optional<unsigned> getMaxVScale() const { return None; }
656
657 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
658 /// are set if the demanded result elements need to be inserted and/or
659 /// extracted from vectors.
660 InstructionCost getScalarizationOverhead(VectorType *InTy,
661 const APInt &DemandedElts,
662 bool Insert, bool Extract) {
663 /// FIXME: a bitfield is not a reasonable abstraction for talking about
664 /// which elements are needed from a scalable vector
665 auto *Ty = cast<FixedVectorType>(InTy);
666
667 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&((void)0)
668 "Vector size mismatch")((void)0);
669
670 InstructionCost Cost = 0;
671
672 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
673 if (!DemandedElts[i])
674 continue;
675 if (Insert)
676 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
677 if (Extract)
678 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
679 }
680
681 return Cost;
682 }
683
684 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
685 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
686 bool Extract) {
687 auto *Ty = cast<FixedVectorType>(InTy);
688
689 APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
690 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
691 }
692
693 /// Estimate the overhead of scalarizing an instructions unique
694 /// non-constant operands. The (potentially vector) types to use for each of
695 /// argument are passes via Tys.
696 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
697 ArrayRef<Type *> Tys) {
698 assert(Args.size() == Tys.size() && "Expected matching Args and Tys")((void)0);
699
700 InstructionCost Cost = 0;
701 SmallPtrSet<const Value*, 4> UniqueOperands;
702 for (int I = 0, E = Args.size(); I != E; I++) {
703 // Disregard things like metadata arguments.
704 const Value *A = Args[I];
705 Type *Ty = Tys[I];
706 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
707 !Ty->isPtrOrPtrVectorTy())
708 continue;
709
710 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
711 if (auto *VecTy = dyn_cast<VectorType>(Ty))
712 Cost += getScalarizationOverhead(VecTy, false, true);
713 }
714 }
715
716 return Cost;
717 }
718
719 /// Estimate the overhead of scalarizing the inputs and outputs of an
720 /// instruction, with return type RetTy and arguments Args of type Tys. If
721 /// Args are unknown (empty), then the cost associated with one argument is
722 /// added as a heuristic.
723 InstructionCost getScalarizationOverhead(VectorType *RetTy,
724 ArrayRef<const Value *> Args,
725 ArrayRef<Type *> Tys) {
726 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
727 if (!Args.empty())
728 Cost += getOperandsScalarizationOverhead(Args, Tys);
729 else
730 // When no information on arguments is provided, we add the cost
731 // associated with one argument as a heuristic.
732 Cost += getScalarizationOverhead(RetTy, false, true);
733
734 return Cost;
735 }
736
737 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
738
739 InstructionCost getArithmeticInstrCost(
740 unsigned Opcode, Type *Ty,
741 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
742 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
743 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
744 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
745 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
746 ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
747 const Instruction *CxtI = nullptr) {
748 // Check if any of the operands are vector operands.
749 const TargetLoweringBase *TLI = getTLI();
750 int ISD = TLI->InstructionOpcodeToISD(Opcode);
751 assert(ISD && "Invalid opcode")((void)0);
752
753 // TODO: Handle more cost kinds.
754 if (CostKind != TTI::TCK_RecipThroughput)
755 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
756 Opd1Info, Opd2Info,
757 Opd1PropInfo, Opd2PropInfo,
758 Args, CxtI);
759
760 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
761
762 bool IsFloat = Ty->isFPOrFPVectorTy();
763 // Assume that floating point arithmetic operations cost twice as much as
764 // integer operations.
765 InstructionCost OpCost = (IsFloat ? 2 : 1);
766
767 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
768 // The operation is legal. Assume it costs 1.
769 // TODO: Once we have extract/insert subvector cost we need to use them.
770 return LT.first * OpCost;
771 }
772
773 if (!TLI->isOperationExpand(ISD, LT.second)) {
774 // If the operation is custom lowered, then assume that the code is twice
775 // as expensive.
776 return LT.first * 2 * OpCost;
777 }
778
779 // An 'Expand' of URem and SRem is special because it may default
780 // to expanding the operation into a sequence of sub-operations
781 // i.e. X % Y -> X-(X/Y)*Y.
782 if (ISD == ISD::UREM || ISD == ISD::SREM) {
783 bool IsSigned = ISD == ISD::SREM;
784 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
785 LT.second) ||
786 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
787 LT.second)) {
788 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
789 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
790 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
791 Opd2PropInfo);
792 InstructionCost MulCost =
793 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
794 InstructionCost SubCost =
795 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
796 return DivCost + MulCost + SubCost;
797 }
798 }
799
800 // We cannot scalarize scalable vectors, so return Invalid.
801 if (isa<ScalableVectorType>(Ty))
802 return InstructionCost::getInvalid();
803
804 // Else, assume that we need to scalarize this op.
805 // TODO: If one of the types get legalized by splitting, handle this
806 // similarly to what getCastInstrCost() does.
807 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
808 InstructionCost Cost = thisT()->getArithmeticInstrCost(
809 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
810 Opd1PropInfo, Opd2PropInfo, Args, CxtI);
811 // Return the cost of multiple scalar invocation plus the cost of
812 // inserting and extracting the values.
813 SmallVector<Type *> Tys(Args.size(), Ty);
814 return getScalarizationOverhead(VTy, Args, Tys) +
815 VTy->getNumElements() * Cost;
816 }
817
818 // We don't know anything about this scalar instruction.
819 return OpCost;
820 }
821
822 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
823 ArrayRef<int> Mask) const {
824 int Limit = Mask.size() * 2;
825 if (Mask.empty() ||
826 // Extra check required by isSingleSourceMaskImpl function (called by
827 // ShuffleVectorInst::isSingleSourceMask).
828 any_of(Mask, [Limit](int I) { return I >= Limit; }))
829 return Kind;
830 switch (Kind) {
831 case TTI::SK_PermuteSingleSrc:
832 if (ShuffleVectorInst::isReverseMask(Mask))
833 return TTI::SK_Reverse;
834 if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
835 return TTI::SK_Broadcast;
836 break;
837 case TTI::SK_PermuteTwoSrc:
838 if (ShuffleVectorInst::isSelectMask(Mask))
839 return TTI::SK_Select;
840 if (ShuffleVectorInst::isTransposeMask(Mask))
841 return TTI::SK_Transpose;
842 break;
843 case TTI::SK_Select:
844 case TTI::SK_Reverse:
845 case TTI::SK_Broadcast:
846 case TTI::SK_Transpose:
847 case TTI::SK_InsertSubvector:
848 case TTI::SK_ExtractSubvector:
849 case TTI::SK_Splice:
850 break;
851 }
852 return Kind;
853 }
854
855 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
856 ArrayRef<int> Mask, int Index,
857 VectorType *SubTp) {
858
859 switch (improveShuffleKindFromMask(Kind, Mask)) {
860 case TTI::SK_Broadcast:
861 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
862 case TTI::SK_Select:
863 case TTI::SK_Splice:
864 case TTI::SK_Reverse:
865 case TTI::SK_Transpose:
866 case TTI::SK_PermuteSingleSrc:
867 case TTI::SK_PermuteTwoSrc:
868 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
869 case TTI::SK_ExtractSubvector:
870 return getExtractSubvectorOverhead(Tp, Index,
871 cast<FixedVectorType>(SubTp));
872 case TTI::SK_InsertSubvector:
873 return getInsertSubvectorOverhead(Tp, Index,
874 cast<FixedVectorType>(SubTp));
875 }
876 llvm_unreachable("Unknown TTI::ShuffleKind")__builtin_unreachable();
877 }
878
879 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
880 TTI::CastContextHint CCH,
881 TTI::TargetCostKind CostKind,
882 const Instruction *I = nullptr) {
883 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
884 return 0;
885
886 const TargetLoweringBase *TLI = getTLI();
887 int ISD = TLI->InstructionOpcodeToISD(Opcode);
888 assert(ISD && "Invalid opcode")((void)0);
889 std::pair<InstructionCost, MVT> SrcLT =
890 TLI->getTypeLegalizationCost(DL, Src);
891 std::pair<InstructionCost, MVT> DstLT =
892 TLI->getTypeLegalizationCost(DL, Dst);
893
894 TypeSize SrcSize = SrcLT.second.getSizeInBits();
895 TypeSize DstSize = DstLT.second.getSizeInBits();
896 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
897 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
898
899 switch (Opcode) {
900 default:
901 break;
902 case Instruction::Trunc:
903 // Check for NOOP conversions.
904 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
905 return 0;
906 LLVM_FALLTHROUGH[[gnu::fallthrough]];
907 case Instruction::BitCast:
908 // Bitcast between types that are legalized to the same type are free and
909 // assume int to/from ptr of the same size is also free.
910 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
911 SrcSize == DstSize)
912 return 0;
913 break;
914 case Instruction::FPExt:
915 if (I && getTLI()->isExtFree(I))
916 return 0;
917 break;
918 case Instruction::ZExt:
919 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
920 return 0;
921 LLVM_FALLTHROUGH[[gnu::fallthrough]];
922 case Instruction::SExt:
923 if (I && getTLI()->isExtFree(I))
924 return 0;
925
926 // If this is a zext/sext of a load, return 0 if the corresponding
927 // extending load exists on target and the result type is legal.
928 if (CCH == TTI::CastContextHint::Normal) {
929 EVT ExtVT = EVT::getEVT(Dst);
930 EVT LoadVT = EVT::getEVT(Src);
931 unsigned LType =
932 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
933 if (DstLT.first == SrcLT.first &&
934 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
935 return 0;
936 }
937 break;
938 case Instruction::AddrSpaceCast:
939 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
940 Dst->getPointerAddressSpace()))
941 return 0;
942 break;
943 }
944
945 auto *SrcVTy = dyn_cast<VectorType>(Src);
946 auto *DstVTy = dyn_cast<VectorType>(Dst);
947
948 // If the cast is marked as legal (or promote) then assume low cost.
949 if (SrcLT.first == DstLT.first &&
950 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
951 return SrcLT.first;
952
953 // Handle scalar conversions.
954 if (!SrcVTy && !DstVTy) {
955 // Just check the op cost. If the operation is legal then assume it costs
956 // 1.
957 if (!TLI->isOperationExpand(ISD, DstLT.second))
958 return 1;
959
960 // Assume that illegal scalar instruction are expensive.
961 return 4;
962 }
963
964 // Check vector-to-vector casts.
965 if (DstVTy && SrcVTy) {
966 // If the cast is between same-sized registers, then the check is simple.
967 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
968
969 // Assume that Zext is done using AND.
970 if (Opcode == Instruction::ZExt)
971 return SrcLT.first;
972
973 // Assume that sext is done using SHL and SRA.
974 if (Opcode == Instruction::SExt)
975 return SrcLT.first * 2;
976
977 // Just check the op cost. If the operation is legal then assume it
978 // costs
979 // 1 and multiply by the type-legalization overhead.
980 if (!TLI->isOperationExpand(ISD, DstLT.second))
981 return SrcLT.first * 1;
982 }
983
984 // If we are legalizing by splitting, query the concrete TTI for the cost
985 // of casting the original vector twice. We also need to factor in the
986 // cost of the split itself. Count that as 1, to be consistent with
987 // TLI->getTypeLegalizationCost().
988 bool SplitSrc =
989 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
990 TargetLowering::TypeSplitVector;
991 bool SplitDst =
992 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
993 TargetLowering::TypeSplitVector;
994 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
995 DstVTy->getElementCount().isVector()) {
996 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
997 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
998 T *TTI = static_cast<T *>(this);
999 // If both types need to be split then the split is free.
1000 InstructionCost SplitCost =
1001 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1002 return SplitCost +
1003 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1004 CostKind, I));
1005 }
1006
1007 // Scalarization cost is Invalid, can't assume any num elements.
1008 if (isa<ScalableVectorType>(DstVTy))
1009 return InstructionCost::getInvalid();
1010
1011 // In other cases where the source or destination are illegal, assume
1012 // the operation will get scalarized.
1013 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1014 InstructionCost Cost = thisT()->getCastInstrCost(
1015 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1016
1017 // Return the cost of multiple scalar invocation plus the cost of
1018 // inserting and extracting the values.
1019 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1020 }
1021
1022 // We already handled vector-to-vector and scalar-to-scalar conversions.
1023 // This
1024 // is where we handle bitcast between vectors and scalars. We need to assume
1025 // that the conversion is scalarized in one way or another.
1026 if (Opcode == Instruction::BitCast) {
1027 // Illegal bitcasts are done by storing and loading from a stack slot.
1028 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1029 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1030 }
1031
1032 llvm_unreachable("Unhandled cast")__builtin_unreachable();
1033 }
1034
1035 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1036 VectorType *VecTy, unsigned Index) {
1037 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1038 Index) +
1039 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1040 TTI::CastContextHint::None,
1041 TTI::TCK_RecipThroughput);
1042 }
1043
1044 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1045 const Instruction *I = nullptr) {
1046 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1047 }
1048
1049 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1050 CmpInst::Predicate VecPred,
1051 TTI::TargetCostKind CostKind,
1052 const Instruction *I = nullptr) {
1053 const TargetLoweringBase *TLI = getTLI();
1054 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1055 assert(ISD && "Invalid opcode")((void)0);
1056
1057 // TODO: Handle other cost kinds.
1058 if (CostKind != TTI::TCK_RecipThroughput)
  [24.1] 'CostKind' is equal to TCK_RecipThroughput
  [13] Assuming 'CostKind' is equal to TCK_RecipThroughput
  [14] Taking false branch
  [25] Taking false branch
1059 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1060 I);
1061
1062 // Selects on vectors are actually vector selects.
1063 if (ISD == ISD::SELECT) {
  [15] Assuming 'ISD' is not equal to SELECT
  [16] Taking false branch
  [26] Assuming 'ISD' is equal to SELECT
  [27] Taking true branch
1064 assert(CondTy && "CondTy must exist")((void)0);
1065 if (CondTy->isVectorTy())
  [28] Called C++ object pointer is null
1066 ISD = ISD::VSELECT;
1067 }
1068 std::pair<InstructionCost, MVT> LT =
1069 TLI->getTypeLegalizationCost(DL, ValTy);
1070
1071 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
  [17] Taking false branch
1072 !TLI->isOperationExpand(ISD, LT.second)) {
1073 // The operation is legal. Assume it costs 1. Multiply
1074 // by the type-legalization overhead.
1075 return LT.first * 1;
1076 }
1077
1078 // Otherwise, assume that the cast is scalarized.
1079 // TODO: If one of the types get legalized by splitting, handle this
1080 // similarly to what getCastInstrCost() does.
1081 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
  [18.1] 'ValVTy' is non-null
  [18] Assuming 'ValTy' is a 'VectorType'
  [19] Taking true branch
1082 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
  [20] 'ValVTy' is a 'FixedVectorType'
1083 if (CondTy)
  [21] Assuming 'CondTy' is null
  [22] Taking false branch
1084 CondTy = CondTy->getScalarType();
1085 InstructionCost Cost = thisT()->getCmpSelInstrCost(
  [24] Calling 'BasicTTIImplBase::getCmpSelInstrCost'
1086 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
  [23] Passing null pointer value via 3rd parameter 'CondTy'
1087
1088 // Return the cost of multiple scalar invocation plus the cost of
1089 // inserting and extracting the values.
1090 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1091 }
1092
1093 // Unknown scalar opcode.
1094 return 1;
1095 }
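
The trace above ends at line 1065: getCmpSelInstrCost scalarizes a vector compare/select, the guard at line 1083 only converts CondTy to its scalar type when CondTy is non-null, so a null CondTy is forwarded unchanged into the recursive call at lines 1085-1086, where the ISD::SELECT branch dereferences it. The following is a minimal, self-contained C++ sketch of that shape; it uses hypothetical stand-in types rather than the real TTI interface, and assumes (as the analyzer does in steps [26]-[27]) that the recursive call takes the select branch.

    #include <cstdio>

    // Hypothetical stand-ins; this only models the control flow of the report,
    // not the real LLVM types or cost model.
    struct Ty {
      bool Vector = false;
      bool isVectorTy() const { return Vector; }
      Ty *getScalarType() { static Ty Scalar; return &Scalar; } // simplified
    };

    // SelectLike models the analyzer's assumption that ISD == ISD::SELECT
    // holds in the recursive call (steps [26]-[27]).
    int cmpSelCost(Ty *ValTy, Ty *CondTy, bool SelectLike) {
      if (SelectLike) {
        if (CondTy->isVectorTy())   // mirrors line 1065: no null check on CondTy
          std::puts("vector select");
      }
      if (ValTy->isVectorTy()) {
        if (CondTy)                 // mirrors line 1083: a null CondTy stays null
          CondTy = CondTy->getScalarType();
        // Mirrors lines 1085-1086: the (possibly null) CondTy is passed through.
        return 4 * cmpSelCost(ValTy->getScalarType(), CondTy, /*SelectLike=*/true);
      }
      return 1; // scalar base case
    }

    int main() {
      Ty Vec{true};
      // A caller that supplies no condition type reaches the dereference in the
      // recursive call, matching steps [21]-[28] of the trace above.
      return cmpSelCost(&Vec, /*CondTy=*/nullptr, /*SelectLike=*/false);
    }

Running the sketch crashes on the marked line, mirroring step [28]; in this build the assert on line 1064 expands to ((void)0), so it offers no protection against the null pointer.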
1096
1097 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1098 unsigned Index) {
1099 std::pair<InstructionCost, MVT> LT =
1100 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1101
1102 return LT.first;
1103 }
1104
1105 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1106 MaybeAlign Alignment, unsigned AddressSpace,
1107 TTI::TargetCostKind CostKind,
1108 const Instruction *I = nullptr) {
1109 assert(!Src->isVoidTy() && "Invalid type")((void)0);
1110 // Assume types, such as structs, are expensive.
1111 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1112 return 4;
1113 std::pair<InstructionCost, MVT> LT =
1114 getTLI()->getTypeLegalizationCost(DL, Src);
1115
1116 // Assuming that all loads of legal types cost 1.
1117 InstructionCost Cost = LT.first;
1118 if (CostKind != TTI::TCK_RecipThroughput)
1119 return Cost;
1120
1121 if (Src->isVectorTy() &&
1122 // In practice it's not currently possible to have a change in lane
1123 // length for extending loads or truncating stores so both types should
1124 // have the same scalable property.
1125 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1126 LT.second.getSizeInBits())) {
1127 // This is a vector load that legalizes to a larger type than the vector
1128 // itself. Unless the corresponding extending load or truncating store is
1129 // legal, then this will scalarize.
1130 TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1131 EVT MemVT = getTLI()->getValueType(DL, Src);
1132 if (Opcode == Instruction::Store)
1133 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1134 else
1135 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1136
1137 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1138 // This is a vector load/store for some illegal type that is scalarized.
1139 // We must account for the cost of building or decomposing the vector.
1140 Cost += getScalarizationOverhead(cast<VectorType>(Src),
1141 Opcode != Instruction::Store,
1142 Opcode == Instruction::Store);
1143 }
1144 }
1145
1146 return Cost;
1147 }
1148
1149 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1150 Align Alignment, unsigned AddressSpace,
1151 TTI::TargetCostKind CostKind) {
1152 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1153 CostKind);
1154 }
1155
1156 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1157 const Value *Ptr, bool VariableMask,
1158 Align Alignment,
1159 TTI::TargetCostKind CostKind,
1160 const Instruction *I = nullptr) {
1161 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1162 true, CostKind);
1163 }
1164
1165 InstructionCost getInterleavedMemoryOpCost(
1166 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1167 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1168 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1169 auto *VT = cast<FixedVectorType>(VecTy);
1170
1171 unsigned NumElts = VT->getNumElements();
1172 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")((void)0);
1173
1174 unsigned NumSubElts = NumElts / Factor;
1175 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1176
1177 // Firstly, the cost of load/store operation.
1178 InstructionCost Cost;
1179 if (UseMaskForCond || UseMaskForGaps)
1180 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1181 AddressSpace, CostKind);
1182 else
1183 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1184 CostKind);
1185
1186 // Legalize the vector type, and get the legalized and unlegalized type
1187 // sizes.
1188 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1189 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1190 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1191
1192 // Scale the cost of the memory operation by the fraction of legalized
1193 // instructions that will actually be used. We shouldn't account for the
1194 // cost of dead instructions since they will be removed.
1195 //
1196 // E.g., An interleaved load of factor 8:
1197 // %vec = load <16 x i64>, <16 x i64>* %ptr
1198 // %v0 = shufflevector %vec, undef, <0, 8>
1199 //
1200 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1201 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1202 // type). The other loads are unused.
1203 //
1204 // We only scale the cost of loads since interleaved store groups aren't
1205 // allowed to have gaps.
1206 if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
1207 // The number of loads of a legal type it will take to represent a load
1208 // of the unlegalized vector type.
1209 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1210
1211 // The number of elements of the unlegalized type that correspond to a
1212 // single legal instruction.
1213 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1214
1215 // Determine which legal instructions will be used.
1216 BitVector UsedInsts(NumLegalInsts, false);
1217 for (unsigned Index : Indices)
1218 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1219 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1220
1221 // Scale the cost of the load by the fraction of legal instructions that
1222 // will be used.
1223 Cost *= UsedInsts.count() / NumLegalInsts;
1224 }
1225
1226 // Then plus the cost of interleave operation.
1227 if (Opcode == Instruction::Load) {
1228 // The interleave cost is similar to extract sub vectors' elements
1229 // from the wide vector, and insert them into sub vectors.
1230 //
1231 // E.g. An interleaved load of factor 2 (with one member of index 0):
1232 // %vec = load <8 x i32>, <8 x i32>* %ptr
1233 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1234 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1235 // <8 x i32> vector and insert them into a <4 x i32> vector.
1236
1237 assert(Indices.size() <= Factor &&((void)0)
1238 "Interleaved memory op has too many members")((void)0);
1239
1240 for (unsigned Index : Indices) {
1241 assert(Index < Factor && "Invalid index for interleaved memory op")((void)0);
1242
1243 // Extract elements from loaded vector for each sub vector.
1244 for (unsigned i = 0; i < NumSubElts; i++)
1245 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1246 Index + i * Factor);
1247 }
1248
1249 InstructionCost InsSubCost = 0;
1250 for (unsigned i = 0; i < NumSubElts; i++)
1251 InsSubCost +=
1252 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1253
1254 Cost += Indices.size() * InsSubCost;
1255 } else {
1256 // The interleave cost is extract all elements from sub vectors, and
1257 // insert them into the wide vector.
1258 //
1259 // E.g. An interleaved store of factor 2:
1260 // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1261 // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1262 // The cost is estimated as extract all elements from both <4 x i32>
1263 // vectors and insert into the <8 x i32> vector.
1264
1265 InstructionCost ExtSubCost = 0;
1266 for (unsigned i = 0; i < NumSubElts; i++)
1267 ExtSubCost +=
1268 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1269 Cost += ExtSubCost * Factor;
1270
1271 for (unsigned i = 0; i < NumElts; i++)
1272 Cost += static_cast<T *>(this)
1273 ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1274 }
1275
1276 if (!UseMaskForCond)
1277 return Cost;
1278
1279 Type *I8Type = Type::getInt8Ty(VT->getContext());
1280 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1281 SubVT = FixedVectorType::get(I8Type, NumSubElts);
1282
1283 // The Mask shuffling cost is extract all the elements of the Mask
1284 // and insert each of them Factor times into the wide vector:
1285 //
1286 // E.g. an interleaved group with factor 3:
1287 // %mask = icmp ult <8 x i32> %vec1, %vec2
1288 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1289 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1290 // The cost is estimated as extract all mask elements from the <8xi1> mask
1291 // vector and insert them factor times into the <24xi1> shuffled mask
1292 // vector.
1293 for (unsigned i = 0; i < NumSubElts; i++)
1294 Cost +=
1295 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1296
1297 for (unsigned i = 0; i < NumElts; i++)
1298 Cost +=
1299 thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1300
1301 // The Gaps mask is invariant and created outside the loop, therefore the
1302 // cost of creating it is not accounted for here. However if we have both
1303 // a MaskForGaps and some other mask that guards the execution of the
1304 // memory access, we need to account for the cost of And-ing the two masks
1305 // inside the loop.
1306 if (UseMaskForGaps)
1307 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1308 CostKind);
1309
1310 return Cost;
1311 }
1312
1313 /// Get intrinsic cost based on arguments.
1314 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1315 TTI::TargetCostKind CostKind) {
1316 // Check for generically free intrinsics.
1317 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1318 return 0;
1319
1320 // Assume that target intrinsics are cheap.
1321 Intrinsic::ID IID = ICA.getID();
1322 if (Function::isTargetIntrinsic(IID))
1323 return TargetTransformInfo::TCC_Basic;
1324
1325 if (ICA.isTypeBasedOnly())
1326 return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1327
1328 Type *RetTy = ICA.getReturnType();
1329
1330 ElementCount RetVF =
1331 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1332 : ElementCount::getFixed(1));
1333 const IntrinsicInst *I = ICA.getInst();
1334 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1335 FastMathFlags FMF = ICA.getFlags();
1336 switch (IID) {
1337 default:
1338 break;
1339
1340 case Intrinsic::cttz:
1341 // FIXME: If necessary, this should go in target-specific overrides.
1342 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1343 return TargetTransformInfo::TCC_Basic;
1344 break;
1345
1346 case Intrinsic::ctlz:
1347 // FIXME: If necessary, this should go in target-specific overrides.
1348 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1349 return TargetTransformInfo::TCC_Basic;
1350 break;
1351
1352 case Intrinsic::memcpy:
1353 return thisT()->getMemcpyCost(ICA.getInst());
1354
1355 case Intrinsic::masked_scatter: {
1356 const Value *Mask = Args[3];
1357 bool VarMask = !isa<Constant>(Mask);
1358 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1359 return thisT()->getGatherScatterOpCost(Instruction::Store,
1360 ICA.getArgTypes()[0], Args[1],
1361 VarMask, Alignment, CostKind, I);
1362 }
1363 case Intrinsic::masked_gather: {
1364 const Value *Mask = Args[2];
1365 bool VarMask = !isa<Constant>(Mask);
1366 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1367 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1368 VarMask, Alignment, CostKind, I);
1369 }
1370 case Intrinsic::experimental_stepvector: {
1371 if (isa<ScalableVectorType>(RetTy))
1372 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1373 // The cost of materialising a constant integer vector.
1374 return TargetTransformInfo::TCC_Basic;
1375 }
1376 case Intrinsic::experimental_vector_extract: {
1377 // FIXME: Handle case where a scalable vector is extracted from a scalable
1378 // vector
1379 if (isa<ScalableVectorType>(RetTy))
1380 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1381 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1382 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1383 cast<VectorType>(Args[0]->getType()), None,
1384 Index, cast<VectorType>(RetTy));
1385 }
1386 case Intrinsic::experimental_vector_insert: {
1387 // FIXME: Handle case where a scalable vector is inserted into a scalable
1388 // vector
1389 if (isa<ScalableVectorType>(Args[1]->getType()))
1390 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1391 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1392 return thisT()->getShuffleCost(
1393 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1394 Index, cast<VectorType>(Args[1]->getType()));
1395 }
1396 case Intrinsic::experimental_vector_reverse: {
1397 return thisT()->getShuffleCost(TTI::SK_Reverse,
1398 cast<VectorType>(Args[0]->getType()), None,
1399 0, cast<VectorType>(RetTy));
1400 }
1401 case Intrinsic::experimental_vector_splice: {
1402 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1403 return thisT()->getShuffleCost(TTI::SK_Splice,
1404 cast<VectorType>(Args[0]->getType()), None,
1405 Index, cast<VectorType>(RetTy));
1406 }
1407 case Intrinsic::vector_reduce_add:
1408 case Intrinsic::vector_reduce_mul:
1409 case Intrinsic::vector_reduce_and:
1410 case Intrinsic::vector_reduce_or:
1411 case Intrinsic::vector_reduce_xor:
1412 case Intrinsic::vector_reduce_smax:
1413 case Intrinsic::vector_reduce_smin:
1414 case Intrinsic::vector_reduce_fmax:
1415 case Intrinsic::vector_reduce_fmin:
1416 case Intrinsic::vector_reduce_umax:
1417 case Intrinsic::vector_reduce_umin: {
1418 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1419 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1420 }
1421 case Intrinsic::vector_reduce_fadd:
1422 case Intrinsic::vector_reduce_fmul: {
1423 IntrinsicCostAttributes Attrs(
1424 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1425 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1426 }
1427 case Intrinsic::fshl:
1428 case Intrinsic::fshr: {
1429 if (isa<ScalableVectorType>(RetTy))
1430 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1431 const Value *X = Args[0];
1432 const Value *Y = Args[1];
1433 const Value *Z = Args[2];
1434 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1435 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1436 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1437 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1438 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1439 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1440 : TTI::OP_None;
1441 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1442 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1443 InstructionCost Cost = 0;
1444 Cost +=
1445 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1446 Cost +=
1447 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1448 Cost += thisT()->getArithmeticInstrCost(
1449 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1450 Cost += thisT()->getArithmeticInstrCost(
1451 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1452 // Non-constant shift amounts requires a modulo.
1453 if (OpKindZ != TTI::OK_UniformConstantValue &&
1454 OpKindZ != TTI::OK_NonUniformConstantValue)
1455 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1456 CostKind, OpKindZ, OpKindBW,
1457 OpPropsZ, OpPropsBW);
1458 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1459 if (X != Y) {
1460 Type *CondTy = RetTy->getWithNewBitWidth(1);
1461 Cost +=
1462 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1463 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1464 Cost +=
1465 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1466 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1467 }
1468 return Cost;
1469 }
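For reference, a minimal scalar sketch of the fshl expansion whose pieces are costed above (a hypothetical 32-bit helper; the urem disappears for constant shift amounts, and the zero check corresponds to the icmp/select pair added for non-rotates):

#include <cstdint>

// Hypothetical illustration of fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  const unsigned BW = 32;
  unsigned ShAmt = Z % BW;                     // urem (folded away for constant Z)
  if (ShAmt == 0)                              // icmp + select for the non-rotate case
    return X;
  return (X << ShAmt) | (Y >> (BW - ShAmt));   // shl, sub, lshr, or
}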
1470 }
1471
1472 // Assume that we need to scalarize this intrinsic.
1473 // Compute the scalarization overhead based on Args for a vector
1474 // intrinsic.
1475 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1476 if (RetVF.isVector() && !RetVF.isScalable()) {
1477 ScalarizationCost = 0;
1478 if (!RetTy->isVoidTy())
1479 ScalarizationCost +=
1480 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1481 ScalarizationCost +=
1482 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1483 }
1484
1485 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1486 ScalarizationCost);
1487 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1488 }
1489
1490 /// Get intrinsic cost based on argument types.
1491 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1492 /// cost of scalarizing the arguments and the return value will be computed
1493 /// based on types.
1494 InstructionCost
1495 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1496 TTI::TargetCostKind CostKind) {
1497 Intrinsic::ID IID = ICA.getID();
1498 Type *RetTy = ICA.getReturnType();
1499 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1500 FastMathFlags FMF = ICA.getFlags();
1501 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1502 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1503
1504 VectorType *VecOpTy = nullptr;
1505 if (!Tys.empty()) {
1506 // The vector reduction operand is operand 0 except for fadd/fmul.
1507 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1508 unsigned VecTyIndex = 0;
1509 if (IID == Intrinsic::vector_reduce_fadd ||
1510 IID == Intrinsic::vector_reduce_fmul)
1511 VecTyIndex = 1;
1512 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1513 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1514 }
1515
1516 // Library call cost - other than size, make it expensive.
1517 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1518 SmallVector<unsigned, 2> ISDs;
1519 switch (IID) {
1520 default: {
1521 // Scalable vectors cannot be scalarized, so return Invalid.
1522 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1523 return isa<ScalableVectorType>(Ty);
1524 }))
1525 return InstructionCost::getInvalid();
1526
1527 // Assume that we need to scalarize this intrinsic.
1528 InstructionCost ScalarizationCost =
1529 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1530 unsigned ScalarCalls = 1;
1531 Type *ScalarRetTy = RetTy;
1532 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1533 if (!SkipScalarizationCost)
1534 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1535 ScalarCalls = std::max(ScalarCalls,
1536 cast<FixedVectorType>(RetVTy)->getNumElements());
1537 ScalarRetTy = RetTy->getScalarType();
1538 }
1539 SmallVector<Type *, 4> ScalarTys;
1540 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1541 Type *Ty = Tys[i];
1542 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1543 if (!SkipScalarizationCost)
1544 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1545 ScalarCalls = std::max(ScalarCalls,
1546 cast<FixedVectorType>(VTy)->getNumElements());
1547 Ty = Ty->getScalarType();
1548 }
1549 ScalarTys.push_back(Ty);
1550 }
1551 if (ScalarCalls == 1)
1552 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1553
1554 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1555 InstructionCost ScalarCost =
1556 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1557
1558 return ScalarCalls * ScalarCost + ScalarizationCost;
1559 }
1560 // Look for intrinsics that can be lowered directly or turned into a scalar
1561 // intrinsic call.
1562 case Intrinsic::sqrt:
1563 ISDs.push_back(ISD::FSQRT);
1564 break;
1565 case Intrinsic::sin:
1566 ISDs.push_back(ISD::FSIN);
1567 break;
1568 case Intrinsic::cos:
1569 ISDs.push_back(ISD::FCOS);
1570 break;
1571 case Intrinsic::exp:
1572 ISDs.push_back(ISD::FEXP);
1573 break;
1574 case Intrinsic::exp2:
1575 ISDs.push_back(ISD::FEXP2);
1576 break;
1577 case Intrinsic::log:
1578 ISDs.push_back(ISD::FLOG);
1579 break;
1580 case Intrinsic::log10:
1581 ISDs.push_back(ISD::FLOG10);
1582 break;
1583 case Intrinsic::log2:
1584 ISDs.push_back(ISD::FLOG2);
1585 break;
1586 case Intrinsic::fabs:
1587 ISDs.push_back(ISD::FABS);
1588 break;
1589 case Intrinsic::canonicalize:
1590 ISDs.push_back(ISD::FCANONICALIZE);
1591 break;
1592 case Intrinsic::minnum:
1593 ISDs.push_back(ISD::FMINNUM);
1594 break;
1595 case Intrinsic::maxnum:
1596 ISDs.push_back(ISD::FMAXNUM);
1597 break;
1598 case Intrinsic::minimum:
1599 ISDs.push_back(ISD::FMINIMUM);
1600 break;
1601 case Intrinsic::maximum:
1602 ISDs.push_back(ISD::FMAXIMUM);
1603 break;
1604 case Intrinsic::copysign:
1605 ISDs.push_back(ISD::FCOPYSIGN);
1606 break;
1607 case Intrinsic::floor:
1608 ISDs.push_back(ISD::FFLOOR);
1609 break;
1610 case Intrinsic::ceil:
1611 ISDs.push_back(ISD::FCEIL);
1612 break;
1613 case Intrinsic::trunc:
1614 ISDs.push_back(ISD::FTRUNC);
1615 break;
1616 case Intrinsic::nearbyint:
1617 ISDs.push_back(ISD::FNEARBYINT);
1618 break;
1619 case Intrinsic::rint:
1620 ISDs.push_back(ISD::FRINT);
1621 break;
1622 case Intrinsic::round:
1623 ISDs.push_back(ISD::FROUND);
1624 break;
1625 case Intrinsic::roundeven:
1626 ISDs.push_back(ISD::FROUNDEVEN);
1627 break;
1628 case Intrinsic::pow:
1629 ISDs.push_back(ISD::FPOW);
1630 break;
1631 case Intrinsic::fma:
1632 ISDs.push_back(ISD::FMA);
1633 break;
1634 case Intrinsic::fmuladd:
1635 ISDs.push_back(ISD::FMA);
1636 break;
1637 case Intrinsic::experimental_constrained_fmuladd:
1638 ISDs.push_back(ISD::STRICT_FMA);
1639 break;
1640 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1641 case Intrinsic::lifetime_start:
1642 case Intrinsic::lifetime_end:
1643 case Intrinsic::sideeffect:
1644 case Intrinsic::pseudoprobe:
1645 case Intrinsic::arithmetic_fence:
1646 return 0;
1647 case Intrinsic::masked_store: {
1648 Type *Ty = Tys[0];
1649 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1650 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1651 CostKind);
1652 }
1653 case Intrinsic::masked_load: {
1654 Type *Ty = RetTy;
1655 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1656 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1657 CostKind);
1658 }
1659 case Intrinsic::vector_reduce_add:
1660 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1661 None, CostKind);
1662 case Intrinsic::vector_reduce_mul:
1663 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1664 None, CostKind);
1665 case Intrinsic::vector_reduce_and:
1666 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1667 None, CostKind);
1668 case Intrinsic::vector_reduce_or:
1669 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1670 CostKind);
1671 case Intrinsic::vector_reduce_xor:
1672 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1673 None, CostKind);
1674 case Intrinsic::vector_reduce_fadd:
1675 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1676 FMF, CostKind);
1677 case Intrinsic::vector_reduce_fmul:
1678 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1679 FMF, CostKind);
1680 case Intrinsic::vector_reduce_smax:
1681 case Intrinsic::vector_reduce_smin:
1682 case Intrinsic::vector_reduce_fmax:
1683 case Intrinsic::vector_reduce_fmin:
1684 return thisT()->getMinMaxReductionCost(
1685 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1686 /*IsUnsigned=*/false, CostKind);
1687 case Intrinsic::vector_reduce_umax:
1688 case Intrinsic::vector_reduce_umin:
1689 return thisT()->getMinMaxReductionCost(
1690 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1691 /*IsUnsigned=*/true, CostKind);
1692 case Intrinsic::abs:
1693 case Intrinsic::smax:
1694 case Intrinsic::smin:
1695 case Intrinsic::umax:
1696 case Intrinsic::umin: {
1697 // abs(X) = select(icmp(X,0),X,sub(0,X))
1698 // minmax(X,Y) = select(icmp(X,Y),X,Y)
1699 Type *CondTy = RetTy->getWithNewBitWidth(1);
1700 InstructionCost Cost = 0;
1701 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1702 Cost +=
1703 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1704 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1705 Cost +=
1706 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1707 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1708 // TODO: Should we add an OperandValueProperties::OP_Zero property?
1709 if (IID == Intrinsic::abs)
1710 Cost += thisT()->getArithmeticInstrCost(
1711 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1712 return Cost;
1713 }
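As a rough illustration of the expansions in the comment above, hypothetical 32-bit scalar equivalents (the INT32_MIN corner of abs is ignored here, much as the min-is-poison form of llvm.abs allows):

#include <cstdint>

static int32_t smax32(int32_t X, int32_t Y) {
  return (X > Y) ? X : Y;          // icmp + select
}

static int32_t abs32(int32_t X) {
  return (X < 0) ? (0 - X) : X;    // icmp + select + sub(0, X)
}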
1714 case Intrinsic::sadd_sat:
1715 case Intrinsic::ssub_sat: {
1716 Type *CondTy = RetTy->getWithNewBitWidth(1);
1717
1718 Type *OpTy = StructType::create({RetTy, CondTy});
1719 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1720 ? Intrinsic::sadd_with_overflow
1721 : Intrinsic::ssub_with_overflow;
1722
1723 // SatMax -> Overflow && SumDiff < 0
1724 // SatMin -> Overflow && SumDiff >= 0
1725 InstructionCost Cost = 0;
1726 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1727 nullptr, ScalarizationCostPassed);
1728 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1729 Cost +=
1730 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1731 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1732 Cost += 2 * thisT()->getCmpSelInstrCost(
1733 BinaryOperator::Select, RetTy, CondTy,
1734 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1735 return Cost;
1736 }
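A minimal scalar sketch of the sadd_sat expansion being costed (hypothetical helper; __builtin_add_overflow stands in for sadd.with.overflow, and the sign test on the wrapped sum selects the saturation value):

#include <cstdint>

static int32_t sadd_sat32(int32_t X, int32_t Y) {
  int32_t Sum;
  bool Overflow = __builtin_add_overflow(X, Y, &Sum);   // sadd.with.overflow
  if (!Overflow)
    return Sum;
  return (Sum < 0) ? INT32_MAX : INT32_MIN;              // icmp + selects
}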
1737 case Intrinsic::uadd_sat:
1738 case Intrinsic::usub_sat: {
1739 Type *CondTy = RetTy->getWithNewBitWidth(1);
1740
1741 Type *OpTy = StructType::create({RetTy, CondTy});
1742 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1743 ? Intrinsic::uadd_with_overflow
1744 : Intrinsic::usub_with_overflow;
1745
1746 InstructionCost Cost = 0;
1747 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1748 nullptr, ScalarizationCostPassed);
1749 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1750 Cost +=
1751 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1752 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1753 return Cost;
1754 }
1755 case Intrinsic::smul_fix:
1756 case Intrinsic::umul_fix: {
1757 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1758 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1759
1760 unsigned ExtOp =
1761 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1762 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1763
1764 InstructionCost Cost = 0;
1765 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1766 Cost +=
1767 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1768 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1769 CCH, CostKind);
1770 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1771 CostKind, TTI::OK_AnyValue,
1772 TTI::OK_UniformConstantValue);
1773 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1774 TTI::OK_AnyValue,
1775 TTI::OK_UniformConstantValue);
1776 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1777 return Cost;
1778 }
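A hypothetical scalar rendering of the smul.fix expansion costed above, assuming 32-bit operands and a scale strictly between 0 and 32 (a zero scale would need separate handling, since shifting by the full width is undefined in C++):

#include <cstdint>

static int32_t smul_fix32(int32_t X, int32_t Y, unsigned Scale) {
  int64_t Prod = (int64_t)X * (int64_t)Y;                   // sext, sext, mul
  uint32_t Lo = (uint32_t)Prod;                             // trunc (low half)
  uint32_t Hi = (uint32_t)((uint64_t)Prod >> 32);           // trunc (high half)
  return (int32_t)((Lo >> Scale) | (Hi << (32 - Scale)));   // lshr, shl, or
}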
1779 case Intrinsic::sadd_with_overflow:
1780 case Intrinsic::ssub_with_overflow: {
1781 Type *SumTy = RetTy->getContainedType(0);
1782 Type *OverflowTy = RetTy->getContainedType(1);
1783 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1784 ? BinaryOperator::Add
1785 : BinaryOperator::Sub;
1786
1787 // LHSSign -> LHS >= 0
1788 // RHSSign -> RHS >= 0
1789 // SumSign -> Sum >= 0
1790 //
1791 // Add:
1792 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1793 // Sub:
1794 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
1795 InstructionCost Cost = 0;
1796 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1797 Cost += 3 * thisT()->getCmpSelInstrCost(
1798 Instruction::ICmp, SumTy, OverflowTy,
1799 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1800 Cost += 2 * thisT()->getCmpSelInstrCost(
1801 Instruction::Select, OverflowTy, OverflowTy,
1802 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1803 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1804 CostKind);
1805 return Cost;
1806 }
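A minimal scalar sketch of the sign-comparison formula from the comment above, shown for the add case (hypothetical helper; the unsigned casts keep the wrapped sum well defined):

#include <cstdint>

static bool sadd_overflows(int32_t LHS, int32_t RHS, int32_t &Sum) {
  Sum = (int32_t)((uint32_t)LHS + (uint32_t)RHS);           // the add being costed
  bool LHSSign = LHS >= 0, RHSSign = RHS >= 0, SumSign = Sum >= 0;
  return (LHSSign == RHSSign) && (LHSSign != SumSign);      // icmps + and
}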
1807 case Intrinsic::uadd_with_overflow:
1808 case Intrinsic::usub_with_overflow: {
1809 Type *SumTy = RetTy->getContainedType(0);
1810 Type *OverflowTy = RetTy->getContainedType(1);
1811 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1812 ? BinaryOperator::Add
1813 : BinaryOperator::Sub;
1814
1815 InstructionCost Cost = 0;
1816 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1817 Cost +=
1818 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1819 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1820 return Cost;
1821 }
1822 case Intrinsic::smul_with_overflow:
1823 case Intrinsic::umul_with_overflow: {
1824 Type *MulTy = RetTy->getContainedType(0);
1825 Type *OverflowTy = RetTy->getContainedType(1);
1826 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1827 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1828
1829 unsigned ExtOp =
1830 IID == Intrinsic::smul_with_overflow ? Instruction::SExt : Instruction::ZExt;
1831 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1832
1833 InstructionCost Cost = 0;
1834 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1835 Cost +=
1836 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1837 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1838 CCH, CostKind);
1839 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1840 CostKind, TTI::OK_AnyValue,
1841 TTI::OK_UniformConstantValue);
1842
1843 if (IID == Intrinsic::smul_with_overflow)
1844 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1845 CostKind, TTI::OK_AnyValue,
1846 TTI::OK_UniformConstantValue);
1847
1848 Cost +=
1849 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1850 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1851 return Cost;
1852 }
1853 case Intrinsic::ctpop:
1854 ISDs.push_back(ISD::CTPOP);
1855 // In case of legalization use TCC_Expensive. This is cheaper than a
1856 // library call but still not a cheap instruction.
1857 SingleCallCost = TargetTransformInfo::TCC_Expensive;
1858 break;
1859 case Intrinsic::ctlz:
1860 ISDs.push_back(ISD::CTLZ);
1861 break;
1862 case Intrinsic::cttz:
1863 ISDs.push_back(ISD::CTTZ);
1864 break;
1865 case Intrinsic::bswap:
1866 ISDs.push_back(ISD::BSWAP);
1867 break;
1868 case Intrinsic::bitreverse:
1869 ISDs.push_back(ISD::BITREVERSE);
1870 break;
1871 }
1872
1873 const TargetLoweringBase *TLI = getTLI();
1874 std::pair<InstructionCost, MVT> LT =
1875 TLI->getTypeLegalizationCost(DL, RetTy);
1876
1877 SmallVector<InstructionCost, 2> LegalCost;
1878 SmallVector<InstructionCost, 2> CustomCost;
1879 for (unsigned ISD : ISDs) {
1880 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1881 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1882 TLI->isFAbsFree(LT.second)) {
1883 return 0;
1884 }
1885
1886 // The operation is legal. Assume it costs 1.
1887 // If the type is split to multiple registers, assume that there is some
1888 // overhead to this.
1889 // TODO: Once we have extract/insert subvector cost we need to use them.
1890 if (LT.first > 1)
1891 LegalCost.push_back(LT.first * 2);
1892 else
1893 LegalCost.push_back(LT.first * 1);
1894 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1895 // If the operation is custom lowered then assume
1896 // that the code is twice as expensive.
1897 CustomCost.push_back(LT.first * 2);
1898 }
1899 }
1900
1901 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1902 if (MinLegalCostI != LegalCost.end())
1903 return *MinLegalCostI;
1904
1905 auto MinCustomCostI =
1906 std::min_element(CustomCost.begin(), CustomCost.end());
1907 if (MinCustomCostI != CustomCost.end())
1908 return *MinCustomCostI;
1909
1910 // If we can't lower fmuladd into an FMA, estimate the cost as a floating
1911 // point mul followed by an add.
1912 if (IID == Intrinsic::fmuladd)
1913 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1914 CostKind) +
1915 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1916 CostKind);
1917 if (IID == Intrinsic::experimental_constrained_fmuladd) {
1918 IntrinsicCostAttributes FMulAttrs(
1919 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1920 IntrinsicCostAttributes FAddAttrs(
1921 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1922 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1923 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1924 }
1925
1926 // Else, assume that we need to scalarize this intrinsic. For math builtins
1927 // this will emit a costly libcall, adding call overhead and spills. Make it
1928 // very expensive.
1929 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1930 // Scalable vectors cannot be scalarized, so return Invalid.
1931 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1932 return isa<ScalableVectorType>(Ty);
1933 }))
1934 return InstructionCost::getInvalid();
1935
1936 InstructionCost ScalarizationCost =
1937 SkipScalarizationCost ? ScalarizationCostPassed
1938 : getScalarizationOverhead(RetVTy, true, false);
1939
1940 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1941 SmallVector<Type *, 4> ScalarTys;
1942 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1943 Type *Ty = Tys[i];
1944 if (Ty->isVectorTy())
1945 Ty = Ty->getScalarType();
1946 ScalarTys.push_back(Ty);
1947 }
1948 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1949 InstructionCost ScalarCost =
1950 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1951 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1952 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1953 if (!ICA.skipScalarizationCost())
1954 ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1955 ScalarCalls = std::max(ScalarCalls,
1956 cast<FixedVectorType>(VTy)->getNumElements());
1957 }
1958 }
1959 return ScalarCalls * ScalarCost + ScalarizationCost;
1960 }
1961
1962 // This is going to be turned into a library call, make it expensive.
1963 return SingleCallCost;
1964 }
1965
1966 /// Compute a cost of the given call instruction.
1967 ///
1968 /// Compute the cost of calling function F with return type RetTy and
1969 /// argument types Tys. F might be nullptr, in this case the cost of an
1970 /// arbitrary call with the specified signature will be returned.
1971 /// This is used, for instance, when we estimate call of a vector
1972 /// counterpart of the given function.
1973 /// \param F Called function, might be nullptr.
1974 /// \param RetTy Return value types.
1975 /// \param Tys Argument types.
1976 /// \returns The cost of Call instruction.
1977 InstructionCost
1978 getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1979 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1980 return 10;
1981 }
1982
1983 unsigned getNumberOfParts(Type *Tp) {
1984 std::pair<InstructionCost, MVT> LT =
1985 getTLI()->getTypeLegalizationCost(DL, Tp);
1986 return *LT.first.getValue();
1987 }
1988
1989 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1990 const SCEV *) {
1991 return 0;
1992 }
1993
1994 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
1995 /// We're assuming that the reduction operations are performed in the following way:
1996 ///
1997 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1998 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1999 /// \----------------v-------------/ \----------v------------/
2000 /// n/2 elements n/2 elements
2001 /// %red1 = op <n x t> %val, <n x t> val1
2002 /// After this operation we have a vector %red1 where only the first n/2
2003 /// elements are meaningful, the second n/2 elements are undefined and can be
2004 /// dropped. All other operations are actually working with the vector of
2005 /// length n/2, not n, though the real vector length is still n.
2006 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2007 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2008 /// \----------------v-------------/ \----------v------------/
2009 /// n/4 elements 3*n/4 elements
2010 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2011 /// length n/2, the resulting vector has length n/4 etc.
2012 ///
2013 /// The cost model should take into account that the actual length of the
2014 /// vector is reduced on each iteration.
2015 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2016 TTI::TargetCostKind CostKind) {
2017 Type *ScalarTy = Ty->getElementType();
2018 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2019 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2020 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2021 NumVecElts >= 2) {
2022 // Or reduction for i1 is represented as:
2023 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2024 // %res = cmp ne iReduxWidth %val, 0
2025 // And reduction for i1 is represented as:
2026 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2027 // %res = cmp eq iReduxWidth %val, 11111
2028 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2029 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2030 TTI::CastContextHint::None, CostKind) +
2031 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2032 CmpInst::makeCmpResultType(ValTy),
2033 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2034 }
2035 unsigned NumReduxLevels = Log2_32(NumVecElts);
2036 InstructionCost ArithCost = 0;
2037 InstructionCost ShuffleCost = 0;
2038 std::pair<InstructionCost, MVT> LT =
2039 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2040 unsigned LongVectorCount = 0;
2041 unsigned MVTLen =
2042 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2043 while (NumVecElts > MVTLen) {
2044 NumVecElts /= 2;
2045 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2046 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2047 NumVecElts, SubTy);
2048 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2049 Ty = SubTy;
2050 ++LongVectorCount;
2051 }
2052
2053 NumReduxLevels -= LongVectorCount;
2054
2055 // The minimal length of the vector is limited by the real length of vector
2056 // operations performed on the current platform. That's why several final
2057 // reduction operations are performed on the vectors with the same
2058 // architecture-dependent length.
2059
2060 // By default reductions need one shuffle per reduction level.
2061 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2062 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2063 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2064 return ShuffleCost + ArithCost +
2065 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2066 }
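A small, hypothetical scalar model of the tree reduction described in the doc comment above, assuming a power-of-two element count; each outer iteration stands for one shuffle plus one vector op, and returning element 0 plays the role of the final extractelement:

#include <cstddef>
#include <vector>

template <typename T, typename Op>
T treeReduce(std::vector<T> Val, Op Combine) {
  for (std::size_t N = Val.size(); N > 1; N /= 2)   // one level: shuffle + op
    for (std::size_t I = 0; I != N / 2; ++I)
      Val[I] = Combine(Val[I], Val[I + N / 2]);
  return Val[0];                                    // final extractelement
}

For example, treeReduce(std::vector<int>{1, 2, 3, 4}, std::plus<int>()) combines pairs across two levels and returns 10.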
2067
2068 /// Try to calculate the cost of performing strict (in-order) reductions,
2069 /// which involves doing a sequence of floating point additions in lane
2070 /// order, starting with an initial value. For example, consider a scalar
2071 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2072 ///
2073 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2074 ///
2075 /// %add1 = %InitVal + %v0
2076 /// %add2 = %add1 + %v1
2077 /// %add3 = %add2 + %v2
2078 /// %add4 = %add3 + %v3
2079 ///
2080 /// As a simple estimate we can say the cost of such a reduction is 4 times
2081 /// the cost of a scalar FP addition. We can only estimate the costs for
2082 /// fixed-width vectors here because for scalable vectors we do not know the
2083 /// runtime number of operations.
2084 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2085 TTI::TargetCostKind CostKind) {
2086 // Targets must implement a default value for the scalable case, since
2087 // we don't know how many lanes the vector has.
2088 if (isa<ScalableVectorType>(Ty))
2089 return InstructionCost::getInvalid();
2090
2091 auto *VTy = cast<FixedVectorType>(Ty);
2092 InstructionCost ExtractCost =
2093 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2094 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2095 Opcode, VTy->getElementType(), CostKind);
2096 ArithCost *= VTy->getNumElements();
2097
2098 return ExtractCost + ArithCost;
2099 }
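Written out as a hypothetical scalar loop, the in-order reduction estimated above is one fadd per lane on top of the extract overhead:

static float orderedFAddReduce(float InitVal, const float *V, unsigned N) {
  float Acc = InitVal;
  for (unsigned I = 0; I != N; ++I)
    Acc += V[I];      // one strictly ordered scalar fadd per lane
  return Acc;
}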
2100
2101 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2102 Optional<FastMathFlags> FMF,
2103 TTI::TargetCostKind CostKind) {
2104 if (TTI::requiresOrderedReduction(FMF))
2105 return getOrderedReductionCost(Opcode, Ty, CostKind);
2106 return getTreeReductionCost(Opcode, Ty, CostKind);
2107 }
2108
2109 /// Try to calculate op costs for min/max reduction operations.
2110 /// \param CondTy Conditional type for the Select instruction.
2111 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2112 bool IsUnsigned,
2113 TTI::TargetCostKind CostKind) {
2114 Type *ScalarTy = Ty->getElementType();
2115 Type *ScalarCondTy = CondTy->getElementType();
2116 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
5. 'Ty' is a 'FixedVectorType'
2117 unsigned NumReduxLevels = Log2_32(NumVecElts);
2118 unsigned CmpOpcode;
2119 if (Ty->isFPOrFPVectorTy()) {
6. Taking false branch
2120 CmpOpcode = Instruction::FCmp;
2121 } else {
2122 assert(Ty->isIntOrIntVectorTy() &&
2123 "expecting floating point or integer type for min/max reduction");
2124 CmpOpcode = Instruction::ICmp;
2125 }
2126 InstructionCost MinMaxCost = 0;
2127 InstructionCost ShuffleCost = 0;
2128 std::pair<InstructionCost, MVT> LT =
2129 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2130 unsigned LongVectorCount = 0;
2131 unsigned MVTLen =
2132 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
7. '?' condition is false
2133 while (NumVecElts > MVTLen) {
8. Assuming 'NumVecElts' is > 'MVTLen'
9. Loop condition is true. Entering loop body
2134 NumVecElts /= 2;
2135 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2136 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
10. Value assigned to 'CondTy'
2137
2138 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2139 NumVecElts, SubTy);
2140 MinMaxCost +=
2141 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
11. Passing 'CondTy' via 3rd parameter 'CondTy'
12. Calling 'BasicTTIImplBase::getCmpSelInstrCost'
2142 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2143 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2144 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2145 Ty = SubTy;
2146 ++LongVectorCount;
2147 }
2148
2149 NumReduxLevels -= LongVectorCount;
2150
2151 // The minimal length of the vector is limited by the real length of vector
2152 // operations performed on the current platform. That's why several final
2153 // reduction operations are performed on the vectors with the same
2154 // architecture-dependent length.
2155 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2156 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2157 MinMaxCost +=
2158 NumReduxLevels *
2159 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2160 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2161 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2162 CmpInst::BAD_ICMP_PREDICATE, CostKind));
2163 // The last min/max should be in vector registers and we counted it above.
2164 // So we just need a single extractelement.
2165 return ShuffleCost + MinMaxCost +
2166 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2167 }
2168
2169 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2170 Type *ResTy, VectorType *Ty,
2171 TTI::TargetCostKind CostKind) {
2172 // Without any native support, this is equivalent to the cost of
2173 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
2174 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2175 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2176 Instruction::Add, ExtTy, None, CostKind);
2177 InstructionCost MulCost = 0;
2178 InstructionCost ExtCost = thisT()->getCastInstrCost(
2179 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2180 TTI::CastContextHint::None, CostKind);
2181 if (IsMLA) {
2182 MulCost =
2183 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2184 ExtCost *= 2;
2185 }
2186
2187 return RedCost + MulCost + ExtCost;
2188 }
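Hypothetical scalar equivalents of the two patterns priced above (an extending add reduction and its multiply-accumulate, IsMLA, form), assuming i32 elements widened to i64:

#include <cstddef>
#include <cstdint>
#include <vector>

static int64_t extAddReduce(const std::vector<int32_t> &A) {
  int64_t Sum = 0;
  for (int32_t V : A)
    Sum += (int64_t)V;                        // ext + add per lane
  return Sum;
}

static int64_t extMLAReduce(const std::vector<int32_t> &A,
                            const std::vector<int32_t> &B) {
  int64_t Sum = 0;
  for (std::size_t I = 0; I != A.size(); ++I)
    Sum += (int64_t)A[I] * (int64_t)B[I];     // ext, ext, mul, add per lane
  return Sum;
}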
2189
2190 InstructionCost getVectorSplitCost() { return 1; }
2191
2192 /// @}
2193};
2194
2195/// Concrete BasicTTIImpl that can be used if no further customization
2196/// is needed.
2197class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2198 using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2199
2200 friend class BasicTTIImplBase<BasicTTIImpl>;
2201
2202 const TargetSubtargetInfo *ST;
2203 const TargetLoweringBase *TLI;
2204
2205 const TargetSubtargetInfo *getST() const { return ST; }
2206 const TargetLoweringBase *getTLI() const { return TLI; }
2207
2208public:
2209 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2210};
2211
2212} // end namespace llvm
2213
2214#endif // LLVM_CODEGEN_BASICTTIIMPL_H